Skip to content

Instantly share code, notes, and snippets.

@davidglassborow
Forked from spacedoom/tweetwordextractor.fsx
Created November 16, 2015 13:13
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save davidglassborow/5ee3fb8bd2233b668957 to your computer and use it in GitHub Desktop.
Save davidglassborow/5ee3fb8bd2233b668957 to your computer and use it in GitHub Desktop.
A simple F# script that extracts words from tweets, given a #tag and a date range. Use this script in order to extract words from tweets, given a #hashtag and a date range. Excellent for creating e.g. word clouds! NB: You also have to get a copy of the StopWords.txt file (or create your own) and place it next to the script. Disclaimer: This is not a finished product. There's ro…
#I @"packages\FSharp.Data.Toolbox.Twitter.0.6\lib\net40"
#I @"packages\FSharp.Data.2.1.1\lib\net40"
#r @".\packages\FSharp.Data.Toolbox.Twitter.0.6\lib\net40\FSharp.Data.Toolbox.Twitter.dll"
#r @".\packages\FSharp.Data.2.1.1\lib\net40\FSharp.Data.dll"
open FSharp.Data.Toolbox.Twitter
// Twitter API credentials used for application-only authentication.
// Check out https://apps.twitter.com for more info on getting key & secret!
let key = "" //Insert key here!
let secret = "" //Insert secret here!
// Authenticated Twitter context built from the key and secret above.
// NOTE(review): presumably fails while the two values are still empty - confirm.
let twitter = Twitter.AuthenticateAppOnly(key, secret)
open System
open System.IO
/// Words to ignore when extracting tweet words, one per line of StopWords.txt,
/// which must sit in the same directory as this script.
/// Path.Combine is used instead of a hard-coded "\\" so the file is also found
/// when the script runs on a platform whose directory separator is not a backslash.
let stopWords = File.ReadAllLines(Path.Combine(__SOURCE_DIRECTORY__, "StopWords.txt"))
/// Splits `text` into tokens on whitespace, punctuation, quotes, brackets,
/// '#', '+' and '-'; empty tokens are dropped.
let words (text:string) =
    let separators =
        [| '#'; '.'; ','; ';'; ':'; '!'; '?'; '`'; ' '; '\r'; '\n'
           '"'; '\''; '“'; '”'; '('; ')'; '+'; '-' |]
    text.Split(separators, StringSplitOptions.RemoveEmptyEntries)
/// Pages through the Twitter search results for "#ndcoslo since:2015-06-15"
/// and returns the lower-cased, de-noised words of every tweet that is not a
/// retweet, in the order the pages were fetched.
///
/// `id` is the optional `maxId` of the first request: `None` issues the search
/// without a `maxId`; `Some n` passes `n` as `maxId`. Each follow-up request
/// uses the smallest tweet id of the previous page minus one.
///
/// Side effects: for every non-empty page the page's words are written to
/// C:/temp/ndcoslo<ticks>.txt, and the file path plus the creation time and
/// text of the page's oldest tweet are printed as progress output.
let getWords id =
    // Same query for the first request and for every follow-up page.
    let query = "#ndcoslo since:2015-06-15"

    // Built once so the stop-word test is a set lookup instead of an array
    // scan for every single word.
    let stopWordSet = Set.ofArray stopWords

    // Fragments left behind when URLs, HTML entities and contractions are
    // tokenised, plus the hashtag itself. The two-letter entries are already
    // covered by the length check below; they are kept to mirror the full list.
    let noiseWords =
        set [ "ndcoslo"; "rt"; "ht"; "co"; "cc"; "yo"; "bit"
              "didn"; "don"; "isn"; "htt"; "&amp" ]
    let noisePrefixes = [ "http"; "co/"; "//t"; "@" ]
    let noiseSuffixes = [ "…"; "%" ]

    // True for a word that should end up in the result.
    let isInteresting (word: string) =
        let isNumber, _ = Int32.TryParse(word)
        word.Length > 2
        && not isNumber
        && not (noiseWords.Contains word)
        && not (noisePrefixes |> List.exists (fun (p: string) -> word.StartsWith(p)))
        && not (noiseSuffixes |> List.exists (fun (s: string) -> word.EndsWith(s)))
        && not (stopWordSet.Contains word)

    // `pages` holds the word list of every fetched page, newest page first.
    // The loop is tail-recursive, so a long hashtag history neither deepens the
    // stack nor re-copies the growing result on every page.
    let rec collect pages id =
        let ndc =
            match id with
            | Some id -> twitter.Search.Tweets(query, count=100, maxId=id)
            | None -> twitter.Search.Tweets(query, count=100)
        let statuses = ndc.Statuses
        if Seq.isEmpty statuses then
            // No more results: restore fetch order and flatten.
            pages |> List.rev |> List.concat
        else
            let pageWords =
                statuses
                |> Seq.filter (fun tweet -> tweet.RetweetedStatus.IsSome |> not)
                |> Seq.filter (fun tweet -> tweet.Text.StartsWith("RT") |> not)
                |> Seq.map (fun tweet -> tweet.Text)
                |> String.concat " "
                |> words
                |> Seq.map (fun s -> s.ToLower())
                |> Seq.filter isInteresting
                |> Seq.toList
            let path = String.Format("C:/temp/ndcoslo{0}.txt", DateTime.Now.Ticks)
            // printfn (not printf) so the path is not glued to the next output line.
            printfn "%s" path
            File.WriteAllText(path, pageWords |> String.concat " ")
            let oldestTweet = statuses |> Seq.minBy (fun s -> s.Id)
            printfn "%A" oldestTweet.CreatedAt
            printfn "%A" oldestTweet.Text
            // Continue strictly below the oldest tweet seen so far.
            collect (pageWords :: pages) (Some (oldestTweet.Id - 1L))

    collect [] id
// Collect the words of every result page; None means the first request is sent without a maxId.
let allWords = getWords None
// All collected words as one space-separated string, saved to a single file.
let ndcoslo = String.concat " " allWords
File.WriteAllText(@"C:\temp\ndcoslo.txt", ndcoslo)
/// The (at most) 50 most frequent words in `allWords` as (word, count) pairs,
/// ordered from most to least frequent.
let myCounts =
    allWords
    |> Seq.countBy id
    |> Seq.sortBy (fun (_, count) -> -count)
    // Seq.take raises when fewer than 50 distinct words were collected;
    // Seq.truncate returns whatever is available instead.
    |> Seq.truncate 50
    |> Seq.toArray
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment