From 45fe5256b3ba87899fda7f955abed6a6f1a59aa6 Mon Sep 17 00:00:00 2001
From: Brady Wyllie
Date: Tue, 19 Aug 2025 13:35:32 +0000
Subject: [PATCH] init repo

---
 .gitignore             |  1 +
 README.md              | 53 ++++++++++++++++++++++++
 includes/docs.js       | 93 ++++++++++++++++++++++++++++++++++++++++++
 includes/vars.js       |  6 +++
 workflow_settings.yaml |  5 +++
 5 files changed, 158 insertions(+)
 create mode 100644 .gitignore
 create mode 100644 README.md
 create mode 100644 includes/docs.js
 create mode 100644 includes/vars.js
 create mode 100644 workflow_settings.yaml

diff --git a/.gitignore b/.gitignore
new file mode 100644
index 0000000..c2658d7
--- /dev/null
+++ b/.gitignore
@@ -0,0 +1 @@
+node_modules/
diff --git a/README.md b/README.md
new file mode 100644
index 0000000..e06f05a
--- /dev/null
+++ b/README.md
@@ -0,0 +1,53 @@
+# PPHE Dataform
+
+Dataform is a tool for managing the SQL queries used in the (T)ransformation stage of an ETL/ELT pipeline. Dataform compiles basic SQL queries, allowing authors to focus on business logic instead of DML and DDL syntax.
+
+At PPHE, we use Dataform for all basic SQL transformations of raw data sources. Our Dataform repository is structured as follows:
+
+* definitions - top-level directory for all .SQLX (Dataform SQL) files
+    * gold - for transformations that write to gld_ datasets in BigQuery
+        * board - for custom Board reports
+        * looker - for custom Looker (Studio) reports
+        * warehouse - for business-critical tables that combine data from many sources
+    * sources - for declaration and transformation of raw data to cleaned staging tables
+        * e.g., opera - for data that originates from Opera
+        * Within each source folder, the following structure applies:
+            * raw - for raw source table declaration statements only
+            * staging - for essential transformations of source tables, including renaming and recasting of fields, unioning of raw tables that come from different versions of the source system, and capturing deleted records
+
+The gold layer should be thought of as the cleanest final layer, where any employee of the business may find useful data; the staging layer should be thought of as a sandbox where analysts and engineers can begin building new analyses.
+
+Any time a new data source is (E)xtracted and (L)oaded to our GCP environment, it should flow through Dataform in the following order:
+
+1. As a new source directory at definitions/sources
+2. As new table declarations for all tables at definitions/sources/{source_name}/raw
+3. As staging transformations, with assertions to validate data quality, for all tables at definitions/sources/{source_name}/staging
+4. As new tables and/or fields incorporated into the different gold layer destinations at definitions/gold
+
+## Dev Environments
+
+PPHE's data models are separated into two environments: production and development. *Production* includes finalized transformations that are safe to rely on for analysis and reporting. *Development* includes transformations that have not been completely validated yet and may still be in testing.
+
+To keep these environments isolated, all new Dataform transformations automatically write to Development. Only after undergoing code review and the CI/CD process do transformations get promoted to Production.
+
+Each Dataform developer has their own development environment to prevent collisions while working on new queries. Dev environments write to BigQuery in the following manner:
+
+* GCP Project: pphe-data-dev
+    * BQ Datasets:
+        * Sources: dev_{username}_src_{source_name}_stg
+            * e.g., dev_bwyllie_src_opera_stg
+        * Gold Tables: dev_{username}_gld_{destination_name}
+            * e.g., dev_bwyllie_gld_board
+
+These transformations also have a built-in `WHERE` clause to select just a small amount of data from the raw source tables.
+
+Once the transformations have been successfully reviewed and promoted to Prod, they write to BigQuery as follows:
+
+* GCP Project: pphe-data-pro
+    * BQ Datasets:
+        * Sources: src_{source_name}_stg
+            * e.g., src_opera_stg
+        * Gold Tables: gld_{destination_name}
+            * e.g., gld_board
+
+New releases are deployed to Production on a weekly cadence to prevent excessive refreshing of large tables.
\ No newline at end of file
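Steps 2 and 3 of the flow above correspond to two kinds of .SQLX files per source. As a rough sketch only (the Opera table, the raw dataset name, and every column name below are assumptions for illustration, not part of this commit), a raw declaration under definitions/sources/opera/raw might look like:

```sqlx
config {
  type: "declaration",
  schema: "src_opera_raw",  // assumed name for the raw dataset; not defined in this commit
  name: "raw_reservation"
}
```

and a matching staging model under definitions/sources/opera/staging could rename and recast fields while attaching data-quality assertions:

```sqlx
config {
  type: "table",
  schema: "src_opera_stg",
  assertions: {
    nonNull: ["tenant_code", "property_code", "reservation_id"]
  }
}

select
  chain_code                  as tenant_code,        -- illustrative raw column names
  resort                      as property_code,
  cast(resv_name_id as int64) as reservation_id,
  export_ts                   as export_insert_time,
  current_timestamp()         as staging_insert_time
from ${ref("raw_reservation")}
```

The schema shown is the production naming; the dev_{username}_ prefixing described in the README is presumably applied at compile time rather than written into the file.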
diff --git a/includes/docs.js b/includes/docs.js
new file mode 100644
index 0000000..ee85c59
--- /dev/null
+++ b/includes/docs.js
@@ -0,0 +1,93 @@
+// Universal fields
+const tenant_code = `The tenant/chain that the record belongs to`;
+const property_code = `The property that the record belongs to`;
+const export_insert_time = `Date and time the raw data record was inserted to BigQuery from Opera R&A`;
+const staging_insert_time = `Date and time the staging data record was inserted from the raw data table`;
+// Identifiers
+const reservation_id = `Within a given property and tenant, identifier for the individual reservation`;
+const reservation_product_id = `Within a given property and tenant, identifier for the individual reservation product`;
+const financial_transaction_id = `Within a given property and tenant, identifier for the individual transaction`;
+const group_id = `Within a given property and tenant, identifier for the individual business group`;
+const event_id = `Within a given property and tenant, identifier for the individual event`;
+const profile_id = `Within a given property and tenant, identifier for the individual profile`;
+const market_segment_code = `Market code`;
+const group_profile_id = `Profile ID of the group`;
+const travel_agent_profile_id = `Profile ID of the travel agent`;
+const company_profile_id = `Profile ID of the company`;
+const guest_profile_id = `Profile ID of the guest`;
+const guest_country_code = `Country code of the guest`;
+const booking_status_code = `Booking status`;
+const booking_source_code = `Booking source`;
+const block_code = `Block code`;
+const rate_code = `Rate code`;
+const transaction_code = `Transaction code`;
+const reservation_status_code = `Reservation status`;
+const room_category_code = `Room category`;
+const booked_room_category_code = `Booked room category`;
+const room_class_code = `Room class`;
+const room_type_code = `Room type`;
+const confirmation_number = `Confirmation number of the reservation`;
+// Dimensions
+const guest_country_name = `Country name of the guest`;
+const product_name = `Product/package code`;
+const product_description = `Full description of the product/package`;
+const group_description = `Full description/name of the group/block`;
+// Dates and times
+const considered_date = `Business Date that the data corresponds to`;
+// Booleans
+const is_meeting_room_flag = `Indicates whether the room is a meeting room`;
+const is_pseudo_room_flag = `Indicates whether the room is a pseudo room`;
+// Stats and metrics
+const number_of_rooms = `Number of rooms`;
+const room_nights = `Total number of nights (across all rooms) for the reservation`;
+const adults = `Number of adults`;
+const children = `Number of children`;
+const room_revenue = `Total net room revenue amount`;
+const food_revenue = `Total net food and beverage revenue amount`;
+const total_revenue = `Total net revenue amount`;
+const other_revenue = `Total net revenue amount that does not fall under room, food, or beverage categories`;
+
+module.exports = {
+  tenant_code,
+  property_code,
+  export_insert_time,
+  staging_insert_time,
+  reservation_id,
+  reservation_product_id,
+  financial_transaction_id,
+  group_id,
+  event_id,
+  profile_id,
+  market_segment_code,
+  group_profile_id,
+  travel_agent_profile_id,
+  company_profile_id,
+  guest_profile_id,
+  guest_country_code,
+  booking_status_code,
+  booking_source_code,
+  block_code,
+  rate_code,
+  transaction_code,
+  reservation_status_code,
+  room_category_code,
+  booked_room_category_code,
+  room_class_code,
+  room_type_code,
+  confirmation_number,
+  guest_country_name,
+  product_name,
+  product_description,
+  group_description,
+  considered_date,
+  is_meeting_room_flag,
+  is_pseudo_room_flag,
+  number_of_rooms,
+  room_nights,
+  adults,
+  children,
+  room_revenue,
+  food_revenue,
+  total_revenue,
+  other_revenue
+}
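The constants above appear intended for reuse as column documentation. A minimal sketch of how they might be attached in a .sqlx config block, using a hypothetical gold model (the path, dataset, and query below are illustrative assumptions):

```sqlx
config {
  type: "table",
  schema: "gld_board",
  columns: {
    tenant_code: docs.tenant_code,
    property_code: docs.property_code,
    room_nights: docs.room_nights,
    room_revenue: docs.room_revenue
  }
}

select
  tenant_code,
  property_code,
  sum(room_nights)  as room_nights,
  sum(room_revenue) as room_revenue
from ${ref("stg_reservation")}  -- hypothetical staging model
group by tenant_code, property_code
```

Keeping the description strings in includes/docs.js means staging and gold models that expose the same field document it identically.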
diff --git a/includes/vars.js b/includes/vars.js
new file mode 100644
index 0000000..0eb3d12
--- /dev/null
+++ b/includes/vars.js
@@ -0,0 +1,6 @@
+// Used to grab additional data in case of ingestion failure
+const ingestion_buffer_days = 3;
+
+module.exports = {
+  ingestion_buffer_days
+}
diff --git a/workflow_settings.yaml b/workflow_settings.yaml
new file mode 100644
index 0000000..196f0b7
--- /dev/null
+++ b/workflow_settings.yaml
@@ -0,0 +1,5 @@
+defaultProject: pphe-data-dev
+defaultLocation: EU
+defaultDataset: dataform
+defaultAssertionDataset: dataform
+dataformCoreVersion: 3.0.26
\ No newline at end of file
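includes/vars.js and workflow_settings.yaml round out the scaffolding: compilation defaults point at pphe-data-dev, matching the development-first workflow in the README, and ingestion_buffer_days gives staging models a reprocessing window for ingestion failures. A minimal sketch of how that buffer could be used in an incremental staging model (the table and column names are illustrative assumptions):

```sqlx
config {
  type: "incremental",
  schema: "src_opera_stg"
}

select
  *,
  current_timestamp() as staging_insert_time
from ${ref("raw_financial_transaction")}  -- hypothetical raw declaration

${when(incremental(),
  `where export_insert_time >= timestamp_sub(
      (select max(export_insert_time) from ${self()}),
      interval ${vars.ingestion_buffer_days} day)`
)}
```

On each incremental run this re-reads the last few days of raw data rather than only the newest records, so a failed or late ingestion load is picked up on the next run.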