https://github.com/michbur/amyloid_ngram

Last synced: 5 months ago
JSON representation
Host: GitHub
URL: https://github.com/michbur/amyloid_ngram
Owner: michbur
Created: 2015-05-08T11:29:35.000Z (about 10 years ago)
Default Branch: master
Last Pushed: 2015-11-09T12:04:50.000Z (over 9 years ago)
Last Synced: 2025-02-18T04:04:47.595Z (5 months ago)
Language: HTML
Size: 103 MB
Stars: 0
Watchers: 3
Forks: 0
Open Issues: 0
Metadata Files:
- Readme: readme.Rmd
Awesome Lists containing this project

README

        Readme

========================================================

Date: `r format(Sys.Date(), format="%B %d %Y")`

### List of files

```{r,echo=FALSE,results='asis',warning=FALSE}

library(xtable)

# Catalogue of file descriptions.
# Each entry is a list with two fields:
#   file_name - name of the file as it appears in the repository root,
#   descr     - text shown in the "Description" column of the readme table.
known_files <- list(
  readme = list(
    file_name = "readme.Rmd",
    descr = "A template for automatically generated readme. Date and list of file included."
  ),
  parsed_readme = list(
    file_name = "readme.md",
    descr = "Parsed readme.Rmd"
  ),
  read_data = list(
    file_name = "read_data.R",
    descr = "Read data from .fasta files"
  ),
  amyloid_pos = list(
    file_name = "amyloid_pos.fasta",
    descr = "Positive (amyloid) sequences."
  ),
  amyloid_neg = list(
    file_name = "amyloid_neg.fasta",
    descr = "Negative (non-amyloid) sequences."
  ),
  analyze_ngrams_function = list(
    file_name = "analyze_ngrams_function.R",
    descr = "A function that extracts important n-grams and calculate their frequency."
  ),
  ngram_analysis = list(
    file_name = "ngram_analysis.R",
    descr = "Analysis of n-grams extracted from amyloids. The analysis was performed separately for several groups of peptides."
  ),
  report1 = list(
    file_name = "report1.Rmd",
    descr = "First report containing results from ngram_analysis.R"
  ),
  parsed_report1 = list(
    file_name = "report1.html",
    descr = "Parsed report1.Rmd"
  ),
  aa_encodings2 = list(
    file_name = "aa_encodings2.R",
    descr = "Amino acid encodings used in the report3. Uses AAIndex database."
  ),
  aa_encodings = list(
    file_name = "aa_encodings.R",
    descr = "Amino acid encodings used in the report2. Uses some older variant AAIndex database (?)."
  ),
  AA_index_mk2 = list(
    file_name = "AA_index_mk2.csv",
    descr = "Properties from AAIndex annotated by MK."
  ),
  aa_nprop = list(
    file_name = "aa_nprop.RData",
    descr = "Normalized values of properties from AAIndex."
  ),
  aa_properties = list(
    file_name = "aa_properties.R",
    descr = "Extract from seqinr package AAIndex data, annotate it partially and save."
  ),
  all_sizes = list(
    file_name = "all_sizes.csv",
    descr = "Created by ngram_analysis.R, contains important n-grams and their properties in amyloids and non-amyloids."
  ),
  amyloid_data = list(
    file_name = "amyloid_data.csv",
    descr = "Data set from Gasior, Kotulska 2015."
  ),
  amyloid_data_extraction = list(
    file_name = "amyloid_data_extraction.R",
    descr = "Extraction of data from amyloid_data.csv, some machine learning. Done for presentation.Rnw"
  ),
  amyloid_fold_res2 = list(
    file_name = "amyloid_fold_res2.RData",
    descr = "Results of cross-validation of the amyloid data done for report3."
  ),
  amyloid_fold_res = list(
    file_name = "amyloid_fold_res.RData",
    descr = "Results of cross-validation of the amyloid data done for report2."
  ),
  amyloid_present_bib = list(
    file_name = "amyloid_present.bib",
    descr = "Bibliography for amyloid presentation."
  ),
  amyloid_seqs = list(
    file_name = "amyloid_seqs.RData",
    descr = "Created by amyloid_data_extraction.R and used in presentation."
  ),
  aoc = list(
    file_name = "aoc.R",
    descr = "Areas of the competence for classifiers."
  ),
  aoc_data = list(
    file_name = "aoc.RData",
    descr = "Areas of the competence for classifiers - data for presentation."
  ),
  bigger6 = list(
    file_name = "bigger6.csv",
    descr = "Important n-grams extracted from sequences longer than 6."
  ),
  bigram_analysis = list(
    file_name = "bigram_analysis.R",
    descr = "Analysis of bigrams for presentation."
  ),
  bigram_analysis_data = list(
    file_name = "bigram_analysis.RData",
    descr = "Data from analysis of bigrams for presentation."
  ),
  casualRF = list(
    file_name = "casualRF.R",
    descr = "Random forests on amyloid sequences without encoding."
  )
)

# Named character vector mapping each catalogue key to its file name.
known_files_names <- vapply(
  known_files,
  function(entry) entry[["file_name"]],
  character(1)
)

# All files in the working directory that are not listed in .gitignore.
#
# .gitignore entries starting with a dot are removed from the exclusion list
# before the comparison. The remaining entries are compared to file names
# literally (setdiff), so glob patterns in .gitignore are not expanded.
#
# grep(invert = TRUE, value = TRUE) is used instead of x[-grep(...)]: with a
# negative index, a .gitignore without any dot-prefixed line yields
# x[-integer(0)], i.e. an empty vector, and no file would be excluded at all.
ignore_entries <- if (file.exists(".gitignore")) {
  readLines(".gitignore")
} else {
  # no .gitignore: nothing to exclude
  character(0)
}
ignored_files <- grep("^\\.", ignore_entries, invert = TRUE, value = TRUE)

repo_files <- sort(setdiff(list.files(), ignored_files))

# Files without an entry in known_files are reported and left out of the table.
has_description <- repo_files %in% known_files_names

if (!all(has_description)) {
  message(paste0("Michal, you slacker! File(s): ",
                 paste(repo_files[!has_description], collapse = ", "),
                 " do not have a proper description."))
  repo_files <- repo_files[has_description]
}

# For every described file, collect the names of the files it loads through
# source() and collapse them into a single comma-separated string ("" when the
# file contains no such call).
#
# Only lines that start with `source(` followed by a quoted literal are
# considered. source() loads a single file, so one call per line is presumed;
# for a line such as source("xyz"); source("zyx") only the first file name is
# reported.
#
# The pattern accepts both double- and single-quoted file names. Lines whose
# argument is not a quoted literal (e.g. source(some_variable)) are skipped;
# splitting such lines on the literal text source(" used to fail with
# "subscript out of bounds".
source_call_pattern <- "^source\\(\\s*([\"'])(.*?)\\1.*$"

# vapply() guarantees a character vector even when repo_files is empty.
sourced_files <- vapply(repo_files, function(single_file) {
  all_lines <- readLines(single_file)
  source_lines <- grep(source_call_pattern, all_lines, perl = TRUE, value = TRUE)
  # second capture group = text between the matching pair of quotes
  file_names <- sub(source_call_pattern, "\\2", source_lines, perl = TRUE)
  paste0(file_names, collapse = ", ")
}, character(1), USE.NAMES = FALSE)

# Look up the description of every listed file. vapply() keeps the result a
# character vector even when repo_files is empty, and match() returns a single
# index even if a file name were registered twice in known_files.
file_descriptions <- vapply(repo_files, function(single_file) {
  known_files[[match(single_file, known_files_names)]][["descr"]]
}, character(1))

# One row per file: name, description and the files it sources.
files_table <- data.frame(repo_files, file_descriptions, sourced_files)

colnames(files_table) <- c("File name", "Description", "Sourced files")

# Emit the table as raw HTML (the chunk uses results='asis').
print(xtable(files_table), type = "html", include.rownames = FALSE)

```

### Repository rules

1. Each file must have a description in readme using the R code in *readme.Rmd*.

2. Use comments frequently.
ecosyste.ms

Data

Tools

Indexes

Applications

Experiments

Awesome

https://github.com/michbur/amyloid_ngram

Awesome Lists containing this project

README