Last active
April 23, 2019 06:52
-
-
Save samueleresca/74bf581877a5971ad5aded7a4a2e4bc1 to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
open Microsoft.ML | |
open Microsoft.ML | |
open Microsoft.ML.Data | |
open Microsoft.ML.Transforms.Text | |
/// Custom stop words (lower-case English function words, contraction
/// fragments, punctuation tokens and a few Spanish words) that are passed to
/// the custom stop-word removal step of the lyrics pipeline.
///
/// Each entry is listed exactly once; the original literal repeated "s" and
/// "t", which added nothing to a membership list.
let stopwords =
    [| "ourselves"; "hers"; "between"; "yourself"; "but"; "again"; "there"
       "about"; "once"; "during"; "out"; "very"; "having"; "with"; "they"
       "own"; "an"; "be"; "some"; "for"; "do"; "its"; "yours"; "such"; "into"
       "of"; "most"; "itself"; "other"; "off"; "is"; "s"; "am"; "or"; "who"
       "as"; "from"; "him"; "each"; "the"; "themselves"; "until"; "below"
       "are"; "we"; "these"; "your"; "his"; "through"; "don"; "nor"; "me"
       "were"; "her"; "more"; "himself"; "this"; "down"; "should"; "our"
       "their"; "while"; "above"; "both"; "up"; "to"; "ours"; "had"; "she"
       "all"; "no"; "when"; "at"; "any"; "before"; "them"; "same"; "and"
       "been"; "have"; "in"; "will"; "on"; "does"; "yourselves"; "then"
       "that"; "because"; "what"; "over"; "why"; "so"; "can"; "did"; "not"
       "now"; "under"; "he"; "you"; "herself"; "has"; "just"; "where"; "too"
       "only"; "myself"; "which"; "those"; "i"; "after"; "few"; "whom"; "t"
       "ll"; "being"; "if"; "theirs"; "my"; "against"; "a"; "by"; "doing"
       "it"; "how"; "further"; "was"; "here"; "than"; "m"
       // Contraction fragments and punctuation-like tokens.
       "'re"; "'ll"; "ve"; "..."
       // NOTE(review): "ä±" looks like a mis-encoded character; confirm the
       // intended token before changing it.
       "ä±"
       "''"; "``"; "--"; "'d"
       // Spanish function words.
       "el"; "la"; "que"; "y"; "de"; "en" |]
/// Separator characters handed to the word tokenizer of the lyrics pipeline.
let symbols =
    [| '\'' // apostrophe
       ' '  // space
       ','  // comma
    |]
/// Renders a line chart of the most frequent words.
///
/// Counts the occurrences of each distinct word in `words`, orders the
/// (word, count) pairs by descending count and hands at most the first 15 of
/// them to `Chart.Line`, whose result is returned.
let renderLineChartForWords (words: seq<string>) =
    words
    |> Seq.countBy id
    |> Seq.sortByDescending snd
    // Seq.truncate, unlike Seq.take, does not throw when fewer than 15
    // distinct words are available (including an empty input).
    |> Seq.truncate 15
    |> Chart.Line
/// Tokenizes the given lyrics with an ML.NET text pipeline and returns all
/// remaining tokens as one flat list.
///
/// The pipeline normalizes the "Lyrics" column, splits it on `symbols`,
/// removes the custom `stopwords` and then the default stop words. It is
/// fitted and applied on the same input data, and the tokens are read from
/// the "LyricsWithNoStopWords" column.
let tokenizeLyrics (lyrics: seq<LyricsInput>) =
    // Fixed seed (0) for the ML.NET context.
    let context = MLContext(seed = Nullable 0)
    let text = context.Transforms.Text
    let input = context.Data.LoadFromEnumerable lyrics

    // NOTE(review): the "FeaturizedLyrics" column is not read anywhere later
    // in this function; the stage may only be here to start the estimator
    // chain. Confirm before removing it.
    let estimator =
        text.FeaturizeText("FeaturizedLyrics", "Lyrics")
            .Append(text.NormalizeText("NormalizedLyrics", "Lyrics"))
            .Append(text.TokenizeWords("TokenizedLyric", "NormalizedLyrics", symbols))
            .Append(text.RemoveStopWords("LyricsWithNoCustomStopWords", "TokenizedLyric", stopwords))
            .Append(text.RemoveDefaultStopWords("LyricsWithNoStopWords", "LyricsWithNoCustomStopWords"))

    let model = estimator.Fit(input)
    let output = model.Transform(input)

    // One string[] of tokens per input row; flatten them into a single list.
    output.GetColumn<string[]>(context, "LyricsWithNoStopWords")
    |> Seq.concat
    |> List.ofSeq
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment