{"id":32388699,"url":"https://github.com/data-integrations/to-utf8-action","last_synced_at":"2025-10-25T03:55:13.048Z","repository":{"id":42121275,"uuid":"88241421","full_name":"data-integrations/to-utf8-action","owner":"data-integrations","description":"An action plugin to convert files created in other character sets into UTF-8 format","archived":false,"fork":false,"pushed_at":"2022-04-12T21:54:21.000Z","size":161,"stargazers_count":0,"open_issues_count":5,"forks_count":2,"subscribers_count":5,"default_branch":"develop","last_synced_at":"2024-04-16T07:44:28.903Z","etag":null,"topics":["cask-marketplace","cdap","cdap-plugin","character-set","utf-8"],"latest_commit_sha":null,"homepage":"","language":"Java","has_issues":true,"has_wiki":null,"has_pages":null,"mirror_url":null,"source_name":null,"license":"apache-2.0","status":null,"scm":"git","pull_requests_enabled":true,"icon_url":"https://github.com/data-integrations.png","metadata":{"files":{"readme":"README.md","changelog":null,"contributing":null,"funding":null,"license":"LICENSE.txt","code_of_conduct":null,"threat_model":null,"audit":null,"citation":null,"codeowners":null,"security":null,"support":null}},"created_at":"2017-04-14T06:51:22.000Z","updated_at":"2019-09-23T12:11:58.000Z","dependencies_parsed_at":"2022-08-12T07:10:48.751Z","dependency_job_id":null,"html_url":"https://github.com/data-integrations/to-utf8-action","commit_stats":null,"previous_names":[],"tags_count":0,"template":false,"template_full_name":null,"purl":"pkg:github/data-integrations/to-utf8-action","repository_url":"https://repos.ecosyste.ms/api/v1/hosts/GitHub/repositories/data-integrations%2Fto-utf8-action","tags_url":"https://repos.ecosyste.ms/api/v1/hosts/GitHub/repositories/data-integrations%2Fto-utf8-action/tags","releases_url":"https://repos.ecosyste.ms/api/v1/hosts/GitHub/repositories/data-integrations%2Fto-utf8-action/releases","manifests_url":"https://repos.ecosyste.ms/api/v1/hosts/GitHub/repositories/data-integrations%2Fto-utf8-a
ction/manifests","owner_url":"https://repos.ecosyste.ms/api/v1/hosts/GitHub/owners/data-integrations","download_url":"https://codeload.github.com/data-integrations/to-utf8-action/tar.gz/refs/heads/develop","sbom_url":"https://repos.ecosyste.ms/api/v1/hosts/GitHub/repositories/data-integrations%2Fto-utf8-action/sbom","scorecard":{"id":324124,"data":{"date":"2025-08-11","repo":{"name":"github.com/data-integrations/to-utf8-action","commit":"4af754260fa4ee92b96eb99a2e6b7869fd4711db"},"scorecard":{"version":"v5.2.1-40-gf6ed084d","commit":"f6ed084d17c9236477efd66e5b258b9d4cc7b389"},"score":3.4,"checks":[{"name":"Packaging","score":-1,"reason":"packaging workflow not detected","details":["Warn: no GitHub/GitLab publishing workflow detected."],"documentation":{"short":"Determines if the project is published as a package that others can easily download, install, easily update, and uninstall.","url":"https://github.com/ossf/scorecard/blob/f6ed084d17c9236477efd66e5b258b9d4cc7b389/docs/checks.md#packaging"}},{"name":"Code-Review","score":3,"reason":"Found 3/8 approved changesets -- score normalized to 3","details":null,"documentation":{"short":"Determines if the project requires human code review before pull requests (aka merge requests) are merged.","url":"https://github.com/ossf/scorecard/blob/f6ed084d17c9236477efd66e5b258b9d4cc7b389/docs/checks.md#code-review"}},{"name":"Binary-Artifacts","score":10,"reason":"no binaries found in the repo","details":null,"documentation":{"short":"Determines if the project has generated executable (binary) artifacts in the source repository.","url":"https://github.com/ossf/scorecard/blob/f6ed084d17c9236477efd66e5b258b9d4cc7b389/docs/checks.md#binary-artifacts"}},{"name":"Maintained","score":0,"reason":"0 commit(s) and 0 issue activity found in the last 90 days -- score normalized to 0","details":null,"documentation":{"short":"Determines if the project is \"actively 
maintained\".","url":"https://github.com/ossf/scorecard/blob/f6ed084d17c9236477efd66e5b258b9d4cc7b389/docs/checks.md#maintained"}},{"name":"Dangerous-Workflow","score":-1,"reason":"no workflows found","details":null,"documentation":{"short":"Determines if the project's GitHub Action workflows avoid dangerous patterns.","url":"https://github.com/ossf/scorecard/blob/f6ed084d17c9236477efd66e5b258b9d4cc7b389/docs/checks.md#dangerous-workflow"}},{"name":"Pinned-Dependencies","score":-1,"reason":"no dependencies found","details":null,"documentation":{"short":"Determines if the project has declared and pinned the dependencies of its build process.","url":"https://github.com/ossf/scorecard/blob/f6ed084d17c9236477efd66e5b258b9d4cc7b389/docs/checks.md#pinned-dependencies"}},{"name":"Token-Permissions","score":-1,"reason":"No tokens found","details":null,"documentation":{"short":"Determines if the project's workflows follow the principle of least privilege.","url":"https://github.com/ossf/scorecard/blob/f6ed084d17c9236477efd66e5b258b9d4cc7b389/docs/checks.md#token-permissions"}},{"name":"CII-Best-Practices","score":0,"reason":"no effort to earn an OpenSSF best practices badge detected","details":null,"documentation":{"short":"Determines if the project has an OpenSSF (formerly CII) Best Practices Badge.","url":"https://github.com/ossf/scorecard/blob/f6ed084d17c9236477efd66e5b258b9d4cc7b389/docs/checks.md#cii-best-practices"}},{"name":"Fuzzing","score":0,"reason":"project is not fuzzed","details":["Warn: no fuzzer integrations found"],"documentation":{"short":"Determines if the project uses fuzzing.","url":"https://github.com/ossf/scorecard/blob/f6ed084d17c9236477efd66e5b258b9d4cc7b389/docs/checks.md#fuzzing"}},{"name":"License","score":10,"reason":"license file detected","details":["Info: project has a license file: LICENSE.txt:0","Info: FSF or OSI recognized license: Apache License 2.0: LICENSE.txt:0"],"documentation":{"short":"Determines if the project has defined a 
license.","url":"https://github.com/ossf/scorecard/blob/f6ed084d17c9236477efd66e5b258b9d4cc7b389/docs/checks.md#license"}},{"name":"Signed-Releases","score":-1,"reason":"no releases found","details":null,"documentation":{"short":"Determines if the project cryptographically signs release artifacts.","url":"https://github.com/ossf/scorecard/blob/f6ed084d17c9236477efd66e5b258b9d4cc7b389/docs/checks.md#signed-releases"}},{"name":"Security-Policy","score":0,"reason":"security policy file not detected","details":["Warn: no security policy file detected","Warn: no security file to analyze","Warn: no security file to analyze","Warn: no security file to analyze"],"documentation":{"short":"Determines if the project has published a security policy.","url":"https://github.com/ossf/scorecard/blob/f6ed084d17c9236477efd66e5b258b9d4cc7b389/docs/checks.md#security-policy"}},{"name":"Branch-Protection","score":0,"reason":"branch protection not enabled on development/release branches","details":["Warn: branch protection not enabled for branch 'develop'"],"documentation":{"short":"Determines if the default and release branches are protected with GitHub's branch protection settings.","url":"https://github.com/ossf/scorecard/blob/f6ed084d17c9236477efd66e5b258b9d4cc7b389/docs/checks.md#branch-protection"}},{"name":"SAST","score":0,"reason":"SAST tool is not run on all commits -- score normalized to 0","details":["Warn: 0 commits out of 19 are checked with a SAST tool"],"documentation":{"short":"Determines if the project uses static code analysis.","url":"https://github.com/ossf/scorecard/blob/f6ed084d17c9236477efd66e5b258b9d4cc7b389/docs/checks.md#sast"}},{"name":"Vulnerabilities","score":10,"reason":"0 existing vulnerabilities detected","details":null,"documentation":{"short":"Determines if the project has open, known unfixed 
vulnerabilities.","url":"https://github.com/ossf/scorecard/blob/f6ed084d17c9236477efd66e5b258b9d4cc7b389/docs/checks.md#vulnerabilities"}}]},"last_synced_at":"2025-08-18T02:04:44.034Z","repository_id":42121275,"created_at":"2025-08-18T02:04:44.034Z","updated_at":"2025-08-18T02:04:44.034Z"},"host":{"name":"GitHub","url":"https://github.com","kind":"github","repositories_count":280901444,"owners_count":26410586,"icon_url":"https://github.com/github.png","version":null,"created_at":"2022-05-30T11:31:42.601Z","updated_at":"2022-07-04T15:15:14.044Z","status":"online","status_checked_at":"2025-10-25T02:00:06.499Z","response_time":81,"last_error":null,"robots_txt_status":"success","robots_txt_updated_at":"2025-07-24T06:49:26.215Z","robots_txt_url":"https://github.com/robots.txt","online":true,"can_crawl_api":true,"host_url":"https://repos.ecosyste.ms/api/v1/hosts/GitHub","repositories_url":"https://repos.ecosyste.ms/api/v1/hosts/GitHub/repositories","repository_names_url":"https://repos.ecosyste.ms/api/v1/hosts/GitHub/repository_names","owners_url":"https://repos.ecosyste.ms/api/v1/hosts/GitHub/owners"}},"keywords":["cask-marketplace","cdap","cdap-plugin","character-set","utf-8"],"created_at":"2025-10-25T03:55:09.692Z","updated_at":"2025-10-25T03:55:13.040Z","avatar_url":"https://github.com/data-integrations.png","language":"Java","readme":"\u003ca href=\"https://cdap-users.herokuapp.com/\"\u003e\u003cimg alt=\"Join CDAP community\" src=\"https://cdap-users.herokuapp.com/badge.svg?t=to-utf8-action\"/\u003e\u003c/a\u003e [![Build Status](https://travis-ci.org/hydrator/to-utf8-action.svg?branch=release/1.0)](https://travis-ci.org/hydrator/to-utf8-action) [![License](https://img.shields.io/badge/License-Apache%202.0-blue.svg)](https://opensource.org/licenses/Apache-2.0) \u003cimg alt=\"CDAP Action\" src=\"https://cdap-users.herokuapp.com/assets/cdap-action.svg\"/\u003e []() \u003cimg src=\"https://cdap-users.herokuapp.com/assets/cm-available.svg\"/\u003e\n\nTo UTF-8 Action 
Plugin\n======================\n\nThe To UTF-8 Action is used to convert files created in other character sets\ninto UTF-8 format so that they can be processed using standard Hadoop Text Input Formats.\nDue to [MAPREDUCE-232](https://issues.apache.org/jira/browse/MAPREDUCE-232), files created\nin other charsets must be converted to UTF-8 before being processed. This plugin supports\nany character set listed under ``java.nio`` in \nthe [Java Supported Encodings Documentation](https://docs.oracle.com/javase/8/docs/technotes/guides/intl/encoding.doc.html).\n\n\u003cimg align=\"center\" src=\"docs/plugin-to-utf8-action.png\"  width=\"400\" alt=\"plugin configuration\" /\u003e\n\nPlugin Configuration\n---------------------\n\n| Configuration | Required | Default | Description |\n| :------------ | :------: | :------ | :---------- |\n| **Source Path** | **Y** | None | The full path of the file or directory that is to be converted. In the case of a directory, if fileRegex is set, then only files in the source directory matching the regex expression will be moved. Otherwise, all files in the directory will be moved. For example: `hdfs://hostname/tmp`. You can use globbing syntax here. |\n| **Destination Path** | **Y** | None | The full path where the file or files are to be saved. If a directory is specified the files will be created in that directory. If the Source Path is a directory, it is assumed that Destination Path is also a directory. The new files will have ``.utf8`` appended to the end. Files with the same name will be overwritten. |\n| **File Regular Expression** | **N** | None | Regular expression to filter the files in the source directory that will be moved. This is useful when the globbing syntax in the source directory is not precise enough for your files. |\n| **Character Set** | **Y** | None| The name of the character set used to create the file. 
The complete list of supported character sets can be found in the [Java Supported Encodings Documentation](https://docs.oracle.com/javase/8/docs/technotes/guides/intl/encoding.doc.html). |\n| **Continue Processing If There Are Errors?** | **Y** | false | Indicates if the pipeline should continue if processing the files fails. |\n\nUsage Notes\n-----------\n\nThis plugin can be very useful for converting sets of files to UTF-8 prior to processing them in MapReduce or Spark. Because this action runs as a single process prior to the MapReduce or Spark job, it can take a considerable amount of time to convert large files. If that is the case, you may be better off writing a custom Input format for handling that data.\n \nThe files are created in the destination folder with ``.utf8`` appended to them. Most likely, you will want to use the ``HDFSDelete`` action at the end of the pipeline to clean up these files.\n\nWhen using the Wrangler tool, you will want to wrangle with the UTF-8 converted file to get the best results.\n\n\nGetting Started\n===============\n\nPrerequisites\n--------------\nCDAP version 4.1.x or higher.\n\nBuilding Plugins\n----------------\nYou get started with To UTF-8 action plugin by building directly from the latest source code::\n\n   git clone git@github.com:hydrator/to-utf8-action.git\n   cd to-utf8-action\n   mvn clean package\n\nAfter the build completes, you will have a JAR for each plugin under each\n``\u003cplugin-name\u003e/target/`` directory.\n\nDeploying Plugins\n-----------------\nYou can deploy a plugin using the CDAP CLI::\n\n  \u003e load artifact \u003ctarget/plugin-jar\u003e config-file \u003cresources/plugin-config\u003e\n\n  \u003e load artifact target/to-utf8-action-\u003cversion\u003e.jar \\\n         config-file target/to-utf8-action-\u003cversion\u003e.json\n\nYou can build without running tests: ``mvn clean install -DskipTests``\n\nMailing Lists\n-------------\nCDAP User Group and Development Discussions:\n\n- 
`cdap-user@googlegroups.com \u003chttps://groups.google.com/d/forum/cdap-user\u003e`__\n\nThe *cdap-user* mailing list is primarily for users using the product to develop\napplications or building plugins for applications. You can expect questions from\nusers, release announcements, and any other discussions that we think will be helpful\nto the users.\n\nIRC Channel\n-----------\nCDAP IRC Channel: #cdap on irc.freenode.net\n\n\nLicense and Trademarks\n======================\n\nCopyright © 2017 Cask Data, Inc.\n\nLicensed under the Apache License, Version 2.0 (the \"License\"); you may not use this file except\nin compliance with the License. You may obtain a copy of the License at\n\nhttp://www.apache.org/licenses/LICENSE-2.0\n\nUnless required by applicable law or agreed to in writing, software distributed under the\nLicense is distributed on an \"AS IS\" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND,\neither express or implied. See the License for the specific language governing permissions\nand limitations under the License.\n\nCask is a trademark of Cask Data, Inc. All rights reserved.\n\nApache, Apache HBase, and HBase are trademarks of The Apache Software Foundation. Used with\npermission. No endorsement by The Apache Software Foundation is implied by the use of these marks.\n","funding_links":[],"categories":[],"sub_categories":[],"project_url":"https://awesome.ecosyste.ms/api/v1/projects/github.com%2Fdata-integrations%2Fto-utf8-action","html_url":"https://awesome.ecosyste.ms/projects/github.com%2Fdata-integrations%2Fto-utf8-action","lists_url":"https://awesome.ecosyste.ms/api/v1/projects/github.com%2Fdata-integrations%2Fto-utf8-action/lists"}