{"id":31696669,"url":"https://github.com/khushneetsingh/datasanity","last_synced_at":"2026-04-08T20:44:37.124Z","repository":{"id":318223665,"uuid":"1070395689","full_name":"KhushneetSingh/DataSanity","owner":"KhushneetSingh","description":"DataSanity is a AI-powered web application for dataset cleaning, synthetic data generation, vectorization, and data enrichment using natural language prompts.","archived":false,"fork":false,"pushed_at":"2025-10-05T22:15:58.000Z","size":10823,"stargazers_count":0,"open_issues_count":0,"forks_count":0,"subscribers_count":0,"default_branch":"main","last_synced_at":"2025-10-05T23:28:34.613Z","etag":null,"topics":["cerebrus","exa","faiss-vector-database","llm","nextjs","numpy","pandas","serperdev","sqlite","tailwindcss"],"latest_commit_sha":null,"homepage":"","language":"JavaScript","has_issues":true,"has_wiki":null,"has_pages":null,"mirror_url":null,"source_name":null,"license":"apache-2.0","status":null,"scm":"git","pull_requests_enabled":true,"icon_url":"https://github.com/KhushneetSingh.png","metadata":{"files":{"readme":"README.md","changelog":null,"contributing":null,"funding":null,"license":"LICENSE","code_of_conduct":null,"threat_model":null,"audit":null,"citation":null,"codeowners":null,"security":null,"support":null,"governance":null,"roadmap":null,"authors":null,"dei":null,"publiccode":null,"codemeta":null,"zenodo":null,"notice":null,"maintainers":null,"copyright":null,"agents":null,"dco":null,"cla":null}},"created_at":"2025-10-05T20:48:20.000Z","updated_at":"2025-10-05T22:16:01.000Z","dependencies_parsed_at":"2025-10-05T23:28:38.967Z","dependency_job_id":"38b28dbb-321e-4039-91a3-f82ae48b57b6","html_url":"https://github.com/KhushneetSingh/DataSanity","commit_stats":null,"previous_names":["khushneetsingh/datasanity"],"tags_count":null,"template":false,"template_full_name":null,"purl":"pkg:github/KhushneetSingh/DataSanity","repository_url":"https://repos.ecosyste.ms/api/v1/hosts/GitHub/repositories/KhushneetSingh%2FDataSanity","tags_url":"https://repos.ecosyste.ms/api/v1/hosts/GitHub/repositories/KhushneetSingh%2FDataSanity/tags","releases_url":"https://repos.ecosyste.ms/api/v1/hosts/GitHub/repositories/KhushneetSingh%2FDataSanity/releases","manifests_url":"https://repos.ecosyste.ms/api/v1/hosts/GitHub/repositories/KhushneetSingh%2FDataSanity/manifests","owner_url":"https://repos.ecosyste.ms/api/v1/hosts/GitHub/owners/KhushneetSingh","download_url":"https://codeload.github.com/KhushneetSingh/DataSanity/tar.gz/refs/heads/main","sbom_url":"https://repos.ecosyste.ms/api/v1/hosts/GitHub/repositories/KhushneetSingh%2FDataSanity/sbom","scorecard":null,"host":{"name":"GitHub","url":"https://github.com","kind":"github","repositories_count":278981518,"owners_count":26079640,"icon_url":"https://github.com/github.png","version":null,"created_at":"2022-05-30T11:31:42.601Z","updated_at":"2022-07-04T15:15:14.044Z","status":"online","status_checked_at":"2025-10-08T02:00:06.501Z","response_time":56,"last_error":null,"robots_txt_status":"success","robots_txt_updated_at":"2025-07-24T06:49:26.215Z","robots_txt_url":"https://github.com/robots.txt","online":true,"can_crawl_api":true,"host_url":"https://repos.ecosyste.ms/api/v1/hosts/GitHub","repositories_url":"https://repos.ecosyste.ms/api/v1/hosts/GitHub/repositories","repository_names_url":"https://repos.ecosyste.ms/api/v1/hosts/GitHub/repository_names","owners_url":"https://repos.ecosyste.ms/api/v1/hosts/GitHub/owners"}},"keywords":["cerebrus","exa","faiss-vector-database","llm","nextjs","numpy","pandas","serperdev","sqlite","tailwindcss"],"created_at":"2025-10-08T17:11:38.463Z","updated_at":"2026-04-08T20:44:37.119Z","avatar_url":"https://github.com/KhushneetSingh.png","language":"JavaScript","funding_links":[],"categories":[],"sub_categories":[],"readme":"# DataSanity\n\nDataSanity is an AI-powered web application for dataset cleaning, synthetic data generation, vectorization, and data enrichment using natural language prompts.\n\n## Features\n\n- Dataset cleaning with LLM detection of noisy, missing, or duplicate values\n- Synthetic data generation based on schema or prompt\n- Vectorization for RAG pipelines\n- Data enrichment using web search APIs\n- Natural language prompt-based workflow\n- Support for CSV uploads and downloads\n\n## Tech Stack\n\n- Frontend: Next.js with Tailwind CSS\n- Backend: FastAPI (Python)\n- LLM Inference: Cerebras API\n- Data Processing: pandas, numpy\n- Embedding: sentence-transformers\n- Vector Store: FAISS\n- Web Search: Exa or Serper.dev\n- Storage: SQLite + local filesystem\n","project_url":"https://awesome.ecosyste.ms/api/v1/projects/github.com%2Fkhushneetsingh%2Fdatasanity","html_url":"https://awesome.ecosyste.ms/projects/github.com%2Fkhushneetsingh%2Fdatasanity","lists_url":"https://awesome.ecosyste.ms/api/v1/projects/github.com%2Fkhushneetsingh%2Fdatasanity/lists"}