{"id":27798103,"url":"https://github.com/bilgeswe/bigdatamanagement","last_synced_at":"2026-01-22T13:48:44.164Z","repository":{"id":286329674,"uuid":"961078969","full_name":"bilgeswe/BigDataManagement","owner":"bilgeswe","description":"Building a Data Pipeline with Lakehouse Architecture on Microsoft Azure Platform","archived":false,"fork":false,"pushed_at":"2025-04-05T18:38:16.000Z","size":2119,"stargazers_count":0,"open_issues_count":0,"forks_count":0,"subscribers_count":1,"default_branch":"main","last_synced_at":"2025-04-30T22:55:29.903Z","etag":null,"topics":["azure","azure-pipelines","azure-service","azure-storage","big-data","big-data-analytics","big-data-processing","data-visualization","datalake-ingestion","dataset","kaggle","sql","uml-diagram"],"latest_commit_sha":null,"homepage":"","language":"TSQL","has_issues":true,"has_wiki":null,"has_pages":null,"mirror_url":null,"source_name":null,"license":null,"status":null,"scm":"git","pull_requests_enabled":true,"icon_url":"https://github.com/bilgeswe.png","metadata":{"files":{"readme":"README.md","changelog":null,"contributing":null,"funding":null,"license":null,"code_of_conduct":null,"threat_model":null,"audit":null,"citation":null,"codeowners":null,"security":null,"support":null,"governance":null,"roadmap":null,"authors":null,"dei":null,"publiccode":null,"codemeta":null,"zenodo":null}},"created_at":"2025-04-05T17:53:09.000Z","updated_at":"2025-04-05T21:30:17.000Z","dependencies_parsed_at":null,"dependency_job_id":"0f91ed1c-b47b-4454-9405-b233a2599a11","html_url":"https://github.com/bilgeswe/BigDataManagement","commit_stats":null,"previous_names":["bilgeswe/bigdatamanagement"],"tags_count":0,"template":false,"template_full_name":null,"purl":"pkg:github/bilgeswe/BigDataManagement","repository_url":"https://repos.ecosyste.ms/api/v1/hosts/GitHub/repositories/bilgeswe%2FBigDataManagement","tags_url":"https://repos.ecosyste.ms/api/v1/hosts/GitHub/repositories/bilgeswe%2FBigDataManagement/tags","releases_url":"https://repos.ecosyste.ms/api/v1/hosts/GitHub/repositories/bilgeswe%2FBigDataManagement/releases","manifests_url":"https://repos.ecosyste.ms/api/v1/hosts/GitHub/repositories/bilgeswe%2FBigDataManagement/manifests","owner_url":"https://repos.ecosyste.ms/api/v1/hosts/GitHub/owners/bilgeswe","download_url":"https://codeload.github.com/bilgeswe/BigDataManagement/tar.gz/refs/heads/main","sbom_url":"https://repos.ecosyste.ms/api/v1/hosts/GitHub/repositories/bilgeswe%2FBigDataManagement/sbom","scorecard":null,"host":{"name":"GitHub","url":"https://github.com","kind":"github","repositories_count":286080680,"owners_count":28664022,"icon_url":"https://github.com/github.png","version":null,"created_at":"2022-05-30T11:31:42.601Z","updated_at":"2026-01-22T01:17:37.254Z","status":"online","status_checked_at":"2026-01-22T02:00:07.137Z","response_time":144,"last_error":null,"robots_txt_status":"success","robots_txt_updated_at":"2025-07-24T06:49:26.215Z","robots_txt_url":"https://github.com/robots.txt","online":true,"can_crawl_api":true,"host_url":"https://repos.ecosyste.ms/api/v1/hosts/GitHub","repositories_url":"https://repos.ecosyste.ms/api/v1/hosts/GitHub/repositories","repository_names_url":"https://repos.ecosyste.ms/api/v1/hosts/GitHub/repository_names","owners_url":"https://repos.ecosyste.ms/api/v1/hosts/GitHub/owners"}},"keywords":["azure","azure-pipelines","azure-service","azure-storage","big-data","big-data-analytics","big-data-processing","data-visualization","datalake-ingestion","dataset","kaggle","sql","uml-diagram"],"created_at":"2025-04-30T22:55:29.355Z","updated_at":"2026-01-22T13:48:44.158Z","avatar_url":"https://github.com/bilgeswe.png","language":"TSQL","funding_links":[],"categories":[],"sub_categories":[],"readme":"# BigDataManagement\nBuilding a Data Pipeline with Lakehouse Architecture on Microsoft Azure Platform\n\n## INTRODUCTION\nIn today’s data-driven world, the ability to efficiently process, analyze, and derive\ninsights from large datasets is critical for organizations across various industries. This\nproject focuses on building an end-to-end data pipeline to analyze Netflix's content\ndataset, leveraging Azure cloud services to implement a scalable and reliable solution.\nThe project follows a structured data pipeline model, transitioning data through ingestion,\nprocessing, storage, and serving layers, to create actionable insights for business and\nacademic purposes. The goal is to identify trends in Netflix content production across\nregions and categories, such as the proportion of modern vs. classic content and the\ndistribution of content duration and count by country. The pipeline design ensures\nflexibility, automation, and adaptability to future data needs, adhering to best practices in\ndata engineering.\nFor the researcher, this study was eye opening as it was the first time for using many\ntools at hand.\n\n## Key Components\nAzure Data Factory (ADF): Used for orchestrating and automating data ingestion and\ntransformation processes.\nAzure Data Lake Storage Gen2: Serves as the storage layer, structured into Bronze,\nSilver, and Gold layers for raw, processed, and analytics-ready data.\nAzure Synapse Analytics: Enables querying, visualizing, and analyzing data using SQL\nexternal tables.\nExternal Tables: Facilitates the serving layer by providing seamless access to processed\ndata stored in the \"Gold\" layer.\nVisualization Tools: SQL-based visualizations in Azure Synapse replace Power BI due\nto subscription constraints.\n\n### Keywords\nData Pipeline, Azure Cloud Services, Data Factory, Data Lake Storage, Synapse\nAnalytics, Content Analysis, Modern vs. Classic Content, Regional Content Trends,\nMachine Learning Integration\n\nSource Material: https://www.kaggle.com/datasets/shivamb/netflix-shows\n","project_url":"https://awesome.ecosyste.ms/api/v1/projects/github.com%2Fbilgeswe%2Fbigdatamanagement","html_url":"https://awesome.ecosyste.ms/projects/github.com%2Fbilgeswe%2Fbigdatamanagement","lists_url":"https://awesome.ecosyste.ms/api/v1/projects/github.com%2Fbilgeswe%2Fbigdatamanagement/lists"}