Created
November 13, 2012 19:24
-
-
Save mattjj/4067817 to your computer and use it in GitHub Desktop.
Stanford Topic Modeling Toolbox (TMT) Example
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
// Input dataset: the CSV file "pubmed-oa-subset.csv" piped into IDColumn(1).
// NOTE(review): IDColumn(1) presumably marks column 1 as the per-record id — confirm against the TMT docs.
val source = CSVFile("pubmed-oa-subset.csv") ~> IDColumn(1)
// Tokenization pipeline: each stage is piped into the next with ~>.
val tokenizer = {
  SimpleEnglishTokenizer() ~>            // tokenize on space and punctuation
  CaseFolder() ~>                        // lowercase everything
  WordsAndNumbersOnlyFilter() ~>         // ignore non-words and non-numbers
  MinimumLengthFilter(3)                 // take terms with >=3 characters
}
// Text-preparation pipeline: starts from `source`, selects the text column,
// tokenizes it with `tokenizer`, then applies term- and document-level filters.
val text = {
  source ~>                              // read from the source file
  Column(4) ~>                           // select column containing text
  TokenizeWith(tokenizer) ~>             // tokenize with tokenizer above
  TermCounter() ~>                       // collect counts (needed below)
  TermMinimumDocumentCountFilter(4) ~>   // filter terms in <4 docs
  TermDynamicStopListFilter(30) ~>       // filter out 30 most common terms
  DocumentMinimumLengthFilter(5)         // take only docs with >=5 terms
}
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment