{"id":16599164,"url":"https://github.com/heartsavior/spark-sql-kafka-offset-committer","last_synced_at":"2025-08-18T05:11:23.614Z","repository":{"id":55655631,"uuid":"210254399","full_name":"HeartSaVioR/spark-sql-kafka-offset-committer","owner":"HeartSaVioR","description":"Kafka offset committer for structured streaming query","archived":false,"fork":false,"pushed_at":"2021-02-15T02:18:27.000Z","size":92,"stargazers_count":39,"open_issues_count":2,"forks_count":15,"subscribers_count":5,"default_branch":"develop-spark3.0","last_synced_at":"2025-07-26T14:17:43.331Z","etag":null,"topics":["kafka","spark","structured-streaming"],"latest_commit_sha":null,"homepage":"","language":"Scala","has_issues":true,"has_wiki":null,"has_pages":null,"mirror_url":null,"source_name":null,"license":"apache-2.0","status":null,"scm":"git","pull_requests_enabled":true,"icon_url":"https://github.com/HeartSaVioR.png","metadata":{"files":{"readme":"README.md","changelog":null,"contributing":null,"funding":null,"license":"LICENSE","code_of_conduct":null,"threat_model":null,"audit":null,"citation":null,"codeowners":null,"security":null,"support":null}},"created_at":"2019-09-23T03:09:47.000Z","updated_at":"2025-07-02T12:41:53.000Z","dependencies_parsed_at":"2022-08-15T05:40:43.913Z","dependency_job_id":null,"html_url":"https://github.com/HeartSaVioR/spark-sql-kafka-offset-committer","commit_stats":null,"previous_names":[],"tags_count":6,"template":false,"template_full_name":null,"purl":"pkg:github/HeartSaVioR/spark-sql-kafka-offset-committer","repository_url":"https://repos.ecosyste.ms/api/v1/hosts/GitHub/repositories/HeartSaVioR%2Fspark-sql-kafka-offset-committer","tags_url":"https://repos.ecosyste.ms/api/v1/hosts/GitHub/repositories/HeartSaVioR%2Fspark-sql-kafka-offset-committer/tags","releases_url":"https://repos.ecosyste.ms/api/v1/hosts/GitHub/repositories/HeartSaVioR%2Fspark-sql-kafka-offset-committer/releases","manifests_url":"https://repos.ecosyste.ms/api/v1/hosts/GitHub/repositories/Hear
tSaVioR%2Fspark-sql-kafka-offset-committer/manifests","owner_url":"https://repos.ecosyste.ms/api/v1/hosts/GitHub/owners/HeartSaVioR","download_url":"https://codeload.github.com/HeartSaVioR/spark-sql-kafka-offset-committer/tar.gz/refs/heads/develop-spark3.0","sbom_url":"https://repos.ecosyste.ms/api/v1/hosts/GitHub/repositories/HeartSaVioR%2Fspark-sql-kafka-offset-committer/sbom","scorecard":{"id":61765,"data":{"date":"2025-08-11","repo":{"name":"github.com/HeartSaVioR/spark-sql-kafka-offset-committer","commit":"35b8f3aafef3b9b32525752ff91150c8b8905f95"},"scorecard":{"version":"v5.2.1-40-gf6ed084d","commit":"f6ed084d17c9236477efd66e5b258b9d4cc7b389"},"score":3.5,"checks":[{"name":"Code-Review","score":0,"reason":"Found 2/30 approved changesets -- score normalized to 0","details":null,"documentation":{"short":"Determines if the project requires human code review before pull requests (aka merge requests) are merged.","url":"https://github.com/ossf/scorecard/blob/f6ed084d17c9236477efd66e5b258b9d4cc7b389/docs/checks.md#code-review"}},{"name":"Token-Permissions","score":-1,"reason":"No tokens found","details":null,"documentation":{"short":"Determines if the project's workflows follow the principle of least privilege.","url":"https://github.com/ossf/scorecard/blob/f6ed084d17c9236477efd66e5b258b9d4cc7b389/docs/checks.md#token-permissions"}},{"name":"Binary-Artifacts","score":10,"reason":"no binaries found in the repo","details":null,"documentation":{"short":"Determines if the project has generated executable (binary) artifacts in the source repository.","url":"https://github.com/ossf/scorecard/blob/f6ed084d17c9236477efd66e5b258b9d4cc7b389/docs/checks.md#binary-artifacts"}},{"name":"Packaging","score":-1,"reason":"packaging workflow not detected","details":["Warn: no GitHub/GitLab publishing workflow detected."],"documentation":{"short":"Determines if the project is published as a package that others can easily download, install, easily update, and 
uninstall.","url":"https://github.com/ossf/scorecard/blob/f6ed084d17c9236477efd66e5b258b9d4cc7b389/docs/checks.md#packaging"}},{"name":"Dangerous-Workflow","score":-1,"reason":"no workflows found","details":null,"documentation":{"short":"Determines if the project's GitHub Action workflows avoid dangerous patterns.","url":"https://github.com/ossf/scorecard/blob/f6ed084d17c9236477efd66e5b258b9d4cc7b389/docs/checks.md#dangerous-workflow"}},{"name":"Maintained","score":0,"reason":"0 commit(s) and 0 issue activity found in the last 90 days -- score normalized to 0","details":null,"documentation":{"short":"Determines if the project is \"actively maintained\".","url":"https://github.com/ossf/scorecard/blob/f6ed084d17c9236477efd66e5b258b9d4cc7b389/docs/checks.md#maintained"}},{"name":"Pinned-Dependencies","score":-1,"reason":"no dependencies found","details":null,"documentation":{"short":"Determines if the project has declared and pinned the dependencies of its build process.","url":"https://github.com/ossf/scorecard/blob/f6ed084d17c9236477efd66e5b258b9d4cc7b389/docs/checks.md#pinned-dependencies"}},{"name":"CII-Best-Practices","score":0,"reason":"no effort to earn an OpenSSF best practices badge detected","details":null,"documentation":{"short":"Determines if the project has an OpenSSF (formerly CII) Best Practices Badge.","url":"https://github.com/ossf/scorecard/blob/f6ed084d17c9236477efd66e5b258b9d4cc7b389/docs/checks.md#cii-best-practices"}},{"name":"Vulnerabilities","score":10,"reason":"0 existing vulnerabilities detected","details":null,"documentation":{"short":"Determines if the project has open, known unfixed vulnerabilities.","url":"https://github.com/ossf/scorecard/blob/f6ed084d17c9236477efd66e5b258b9d4cc7b389/docs/checks.md#vulnerabilities"}},{"name":"Fuzzing","score":0,"reason":"project is not fuzzed","details":["Warn: no fuzzer integrations found"],"documentation":{"short":"Determines if the project uses 
fuzzing.","url":"https://github.com/ossf/scorecard/blob/f6ed084d17c9236477efd66e5b258b9d4cc7b389/docs/checks.md#fuzzing"}},{"name":"Security-Policy","score":0,"reason":"security policy file not detected","details":["Warn: no security policy file detected","Warn: no security file to analyze","Warn: no security file to analyze","Warn: no security file to analyze"],"documentation":{"short":"Determines if the project has published a security policy.","url":"https://github.com/ossf/scorecard/blob/f6ed084d17c9236477efd66e5b258b9d4cc7b389/docs/checks.md#security-policy"}},{"name":"License","score":10,"reason":"license file detected","details":["Info: project has a license file: LICENSE:0","Info: FSF or OSI recognized license: Apache License 2.0: LICENSE:0"],"documentation":{"short":"Determines if the project has defined a license.","url":"https://github.com/ossf/scorecard/blob/f6ed084d17c9236477efd66e5b258b9d4cc7b389/docs/checks.md#license"}},{"name":"Signed-Releases","score":-1,"reason":"no releases found","details":null,"documentation":{"short":"Determines if the project cryptographically signs release artifacts.","url":"https://github.com/ossf/scorecard/blob/f6ed084d17c9236477efd66e5b258b9d4cc7b389/docs/checks.md#signed-releases"}},{"name":"Branch-Protection","score":-1,"reason":"internal error: error during branchesHandler.setup: internal error: githubv4.Query: Resource not accessible by integration","details":null,"documentation":{"short":"Determines if the default and release branches are protected with GitHub's branch protection settings.","url":"https://github.com/ossf/scorecard/blob/f6ed084d17c9236477efd66e5b258b9d4cc7b389/docs/checks.md#branch-protection"}},{"name":"SAST","score":0,"reason":"SAST tool is not run on all commits -- score normalized to 0","details":["Warn: 0 commits out of 2 are checked with a SAST tool"],"documentation":{"short":"Determines if the project uses static code 
analysis.","url":"https://github.com/ossf/scorecard/blob/f6ed084d17c9236477efd66e5b258b9d4cc7b389/docs/checks.md#sast"}}]},"last_synced_at":"2025-08-15T01:52:13.809Z","repository_id":55655631,"created_at":"2025-08-15T01:52:13.810Z","updated_at":"2025-08-15T01:52:13.810Z"},"host":{"name":"GitHub","url":"https://github.com","kind":"github","repositories_count":270946068,"owners_count":24672890,"icon_url":"https://github.com/github.png","version":null,"created_at":"2022-05-30T11:31:42.601Z","updated_at":"2022-07-04T15:15:14.044Z","status":"online","status_checked_at":"2025-08-18T02:00:08.743Z","response_time":89,"last_error":null,"robots_txt_status":"success","robots_txt_updated_at":"2025-07-24T06:49:26.215Z","robots_txt_url":"https://github.com/robots.txt","online":true,"can_crawl_api":true,"host_url":"https://repos.ecosyste.ms/api/v1/hosts/GitHub","repositories_url":"https://repos.ecosyste.ms/api/v1/hosts/GitHub/repositories","repository_names_url":"https://repos.ecosyste.ms/api/v1/hosts/GitHub/repository_names","owners_url":"https://repos.ecosyste.ms/api/v1/hosts/GitHub/owners"}},"keywords":["kafka","spark","structured-streaming"],"created_at":"2024-10-12T00:10:34.333Z","updated_at":"2025-08-18T05:11:23.484Z","avatar_url":"https://github.com/HeartSaVioR.png","language":"Scala","funding_links":[],"categories":[],"sub_categories":[],"readme":"# Kafka offset committer for Spark structured streaming\n\n[![CircleCI](https://circleci.com/gh/HeartSaVioR/spark-sql-kafka-offset-committer/tree/master.svg?style=svg)](https://circleci.com/gh/HeartSaVioR/spark-sql-kafka-offset-committer/tree/master)\n\nKafka offset committer helps structured streaming query which uses Kafka Data Source to commit offsets which batch has been processed.\n\nThis project is not for replacing checkpoint mechanism of Spark with Kafka's one. To provide full of \"fault-tolerance\" semantic, Spark has to take 100% of control of manipulating checkpoint, and Kafka data source is no exception. 
This project can be used to leverage Kafka ecosystem tools to track the committed offsets on Spark checkpoint, which is not possible solely with Spark.\n\nThis project is inspired by [SPARK-27549](https://issues.apache.org/jira/browse/SPARK-27549), which proposed adding this feature to the Spark codebase, but the decision was made not to include it in Spark. You can consider this project a \"follow-up\" of SPARK-27549. This project is also inspired by [Spark Atlas Connector](https://github.com/hortonworks-spark/spark-atlas-connector) - SAC leverages Scala reflection to extract topic information from query execution. Kafka offset committer uses the same approach to extract Kafka parameters. Credits to everyone involved in SPARK-27549 \u0026 SAC.\n\n## Supported versions\n\nBoth Spark 3.0.x and 2.4.x are supported; this only means you should use one of these versions when using this project.\n\nThe project provides cross-compile for Scala 2.11 and 2.12 (thanks [@redsk](https://github.com/redsk)!) for Spark 2.4.x; please pick the right artifact for your Scala version.\n\nSpark version | Scala versions | artifact version\n------------- | -------------- | ----------------\n2.4.x         | 2.11 / 2.12    | 0.4.0-spark-2.4\n3.0.x         | 2.12           | 0.4.0-spark-3.0\n\n## How to import\n\nAdd this to your maven pom.xml file. If you're using other builds like groovy or sbt or so, please import the artifact accordingly; groupId: `net.heartsavior.spark`, artifactId: `spark-sql-kafka-offset-committer_\u003cscala_version\u003e`.\n\nPlease replace `{{...}}` with the content from the matrix above:\n\n```\n\u003cdependency\u003e\n  \u003cgroupId\u003enet.heartsavior.spark\u003c/groupId\u003e\n  \u003cartifactId\u003espark-sql-kafka-offset-committer_{{scala_version}}\u003c/artifactId\u003e\n  \u003cversion\u003e{{artifact_version}}\u003c/version\u003e\n\u003c/dependency\u003e\n```\n\nYou can dynamically include the jar file while submitting by leveraging the `--packages` option. 
`--packages net.heartsavior.spark:spark-sql-kafka-offset-committer:0.1.0`. You may want to add `--conf spark.sql.streaming.streamingQueryListeners=net.heartsavior.spark.KafkaOffsetCommitterListener` as well, since you're dynamically adding the jar, hence the class is not accessible in your uber jar.\n\n## How to use\n\nKafka offset committer is implemented as StreamingQueryListener. There are two approaches to enable a streaming query listener:\n\n1. Attach the instance of `KafkaOffsetCommitterListener` via below:\n\n```scala\nval listener = new KafkaOffsetCommitterListener()\nspark.streams.addListener(listener)\n```\n\n2. Add `net.heartsavior.spark.KafkaOffsetCommitterListener` to the value of `spark.sql.streaming.streamingQueryListeners` in your Spark config.\n(The value is separated by `,` so you can add multiple listeners if you have any other listeners.) \n\nOnce the listener is set, you can add a special option to the Kafka data source options so that Kafka committer can see the `groupId` to commit:\n\n```scala\nspark.readStream\n  .format(\"kafka\")\n  .option(\"kafka.bootstrap.servers\", \"localhost:9092\")\n  .option(\"subscribePattern\", \"topic[1-3]\")\n  .option(\"startingOffsets\", \"earliest\")\n  .option(\"kafka.consumer.commit.groupid\", \"groupId1\")\n  .load()\n``` \n\n\"kafka.consumer.commit.groupid\" is the new config to specify the consumer group ID to commit. Manually specifying the consumer group ID is needed, because Spark will\nassign a unique consumer group ID to avoid multiple queries conflicting with each other. This also means, you may want to thoughtfully set the option and\n decide the name of group ID so that multiple queries don't use the same group ID for committing.\n\nDue to technical reasons, the project uses reflection to extract options from query execution. Given we intercept Kafka parameters instead of source options\n of DataSource, adding \"kafka.\" to option key is necessary and it brings unintended warning messages from Kafka side. 
(Sorry!) You can adjust your log4j config\nto hide the warning messages.\n\nHere's an example of command to run spark-shell with kafka committer listener being set, and simple query to read from Kafka topics and write to Kafka topic.\n\n\u003e command\n\n```\n./bin/spark-shell --master \"local[3]\" --packages org.apache.spark:spark-sql-kafka-0-10_2.11:2.4.3 --jars ./spark-sql-kafka-offset-committer-0.1.0-SNAPSHOT.jar --conf spark.sql.streaming.streamingQueryListeners=net.heartsavior.spark.KafkaOffsetCommitterListener\n```\n\n\u003e query\n\n```scala\nval bootstrapServers = \"localhost:9092\"\nval checkpointLocation = \"/tmp/mykafkaaaaaaa\"\nval sourceTopics = Seq(\"truck_events_stream\").mkString(\",\")\nval sourceTopics2 = Seq(\"truck_speed_events_stream\").mkString(\",\")\n\nval targetTopic = \"sparksinkstreaming\"\n\nval df = spark.readStream.format(\"kafka\").option(\"kafka.bootstrap.servers\", bootstrapServers).option(\"subscribe\", sourceTopics).option(\"startingOffsets\", \"earliest\").option(\"kafka.consumer.commit.groupid\", \"spark-sql-kafka-offset-committer-test-1\").load()\n\nval df2 = spark.readStream.format(\"kafka\").option(\"kafka.bootstrap.servers\", bootstrapServers).option(\"subscribe\", sourceTopics2).option(\"startingOffsets\", \"earliest\").option(\"kafka.consumer.commit.groupid\", \"spark-sql-kafka-offset-committer-test-1\").load()\n\nval query = df.union(df2).writeStream.format(\"kafka\").option(\"kafka.bootstrap.servers\", bootstrapServers).option(\"checkpointLocation\", checkpointLocation).option(\"topic\", targetTopic).option(\"kafka.atlas.cluster.name\", \"sink\").start()\n```\n\n\u003e result\n\n```\n$ kafka-consumer-groups --bootstrap-server localhost:9092 --describe --group spark-sql-kafka-offset-committer-test-1\nConsumer group 'spark-sql-kafka-offset-committer-test-1' has no active members.\n\nTOPIC                                    PARTITION  CURRENT-OFFSET  LOG-END-OFFSET  LAG             CONSUMER-ID     HOST            
CLIENT-ID\ntruck_speed_events_stream                5          844553          844577          24              -               -               -\ntruck_speed_events_stream                2          675521          675540          19              -               -               -\ntruck_speed_events_stream                6          168828          168833          5               -               -               -\ntruck_speed_events_stream                3          337819          337827          8               -               -               -\ntruck_speed_events_stream                7          675566          675585          19              -               -               -\ntruck_speed_events_stream                4          168914          168919          5               -               -               -\ntruck_speed_events_stream                0          168894          168899          5               -               -               -\ntruck_speed_events_stream                8          675570          675589          19              -               -               -\ntruck_speed_events_stream                1          168917          168922          5               -               -               -\ntruck_events_stream                      0          3884586         3884695         109             -               -               -\ntruck_speed_events_stream                9          0               0               0               -               -               -\n```\n\nAfter stopping ingestion of records and waiting for query to fully process the records:\n\n```\n$ kafka-consumer-groups --bootstrap-server localhost:9092 --describe --group spark-sql-kafka-offset-committer-test-1\nConsumer group 'spark-sql-kafka-offset-committer-test-1' has no active members.\n\nTOPIC                                    PARTITION  CURRENT-OFFSET  LOG-END-OFFSET  LAG             CONSUMER-ID     HOST            CLIENT-ID\ntruck_speed_events_stream                5          
856338          856338          0               -               -               -\ntruck_speed_events_stream                2          684958          684958          0               -               -               -\ntruck_speed_events_stream                6          171186          171186          0               -               -               -\ntruck_speed_events_stream                3          342534          342534          0               -               -               -\ntruck_speed_events_stream                7          684998          684998          0               -               -               -\ntruck_speed_events_stream                4          171272          171272          0               -               -               -\ntruck_speed_events_stream                0          171255          171255          0               -               -               -\ntruck_speed_events_stream                8          684999          684999          0               -               -               -\ntruck_speed_events_stream                1          171276          171276          0               -               -               -\ntruck_events_stream                      0          3938820         3938820         0               -               -               -\ntruck_speed_events_stream                9          0               0               0               -               -               -\n```\n\n\n## License\n\nCopyright 2019-2021 Jungtaek Lim \"\u003ckabhwan@gmail.com\u003e\"\n\nLicensed under the Apache License, Version 2.0 (the \"License\");\nyou may not use this file except in compliance with the License.\nYou may obtain a copy of the License at\n\nhttp://www.apache.org/licenses/LICENSE-2.0\n\nUnless required by applicable law or agreed to in writing, software\ndistributed under the License is distributed on an \"AS IS\" BASIS,\nWITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\nSee the License for the specific 
language governing permissions and\nlimitations under the License. \n","project_url":"https://awesome.ecosyste.ms/api/v1/projects/github.com%2Fheartsavior%2Fspark-sql-kafka-offset-committer","html_url":"https://awesome.ecosyste.ms/projects/github.com%2Fheartsavior%2Fspark-sql-kafka-offset-committer","lists_url":"https://awesome.ecosyste.ms/api/v1/projects/github.com%2Fheartsavior%2Fspark-sql-kafka-offset-committer/lists"}