Skip to content

Instantly share code, notes, and snippets.

@rolandleth
Created June 20, 2018 07:49
Show Gist options
  • Star 1 You must be signed in to star a gist
  • Fork 2 You must be signed in to fork a gist
  • Save rolandleth/2dc971edc72f83a4eea6b2db523c529d to your computer and use it in GitHub Desktop.
Save rolandleth/2dc971edc72f83a4eea6b2db523c529d to your computer and use it in GitHub Desktop.
Extracting and converting your Twitter archive into simpler objects
// Blog post: https://rolandleth.com/extracting-and-parsing-tweets-from-your-twitter-archive
import Foundation
/// A single tweet reduced to the two fields this script keeps.
///
/// Conforms to `Codable` so a list of tweets can be encoded/decoded with the
/// synthesized implementation (keys are the property names).
struct Tweet: Codable {
    /// The tweet body, as produced by the conversion code that builds the value.
    /// The timestamp string associated with the tweet.
    let text, timestamp: String
}
/// Output format used when links and @handles are rewritten in a tweet's text.
///
/// - `markdown`: links are emitted as `[label](url)`.
/// - `html`: links are emitted as `<a href="url">label</a>`.
/// - `none`: only the bare URL is emitted.
enum Syntax {
    case markdown, html, none
}
/// Output format applied to every link and handle rewritten below.
let syntax = Syntax.markdown

/// Detector used to find link-like substrings inside a tweet's text.
let dataDetector = try! NSDataDetector(
    types: NSTextCheckingResult.CheckingType.link.rawValue
)

/// Matches an `@` followed by one or more characters that are not
/// punctuation, brackets, quotes or a space (i.e. a Twitter handle).
let handleRegex = try! NSRegularExpression(
    pattern: "@[^.,:;?!'\"\\-()\\[\\]{} ]+",
    options: [.caseInsensitive]
)

/// Path of the bundled `tweets.csv`; the script cannot run without it,
/// so a missing resource is treated as a programmer error.
let file = Bundle.main.path(forResource: "tweets", ofType: "csv")!

/// Parsed contents of the archive CSV.
let csv = try! CSV(name: file)
/// Rows of the archive that are original tweets: retweets/quotes, replies and
/// links back to the author's own blog are dropped.
var rawTweets = csv.rows.filter { row in
    let text = row["text"]
    let expandedURLs = row["expanded_urls"]

    // Substrings that mark a row as a retweet / quote / share of someone else.
    let retweetURLMarkers = ["https://twitter.com", "favd.net"]
    let retweetTextMarkers = ["via @", "RT @", "\"@", "“@"]

    let hasRetweetURL = retweetURLMarkers.contains { expandedURLs?.contains($0) == true }
    let hasRetweetText = retweetTextMarkers.contains { text?.contains($0) == true }
    let isRetweet = row["retweeted_status_user_id"]?.isEmpty == false
        || hasRetweetURL
        || hasRetweetText
        || text == "."

    // A reply either references another status or starts by addressing a handle.
    let isReply = row["in_reply_to_status_id"]?.isEmpty == false
        || text?.hasPrefix("@") == true

    let isLinkToBlog = expandedURLs?.contains("rolandleth.com") == true

    return !(isRetweet || isReply || isLinkToBlog)
}
/// Converts every remaining CSV row into a `Tweet`.
///
/// For each row the closure:
///  1. rewrites every detected link using the matching entry of the row's
///     `expanded_urls` column (or the link itself when none is available),
///  2. rewrites every `@handle` into a link to the account's Twitter page,
///  3. compacts the timestamp by replacing spaces with `-`, removing `:` and
///     keeping at most the first 15 characters.
///
/// Links and handles are formatted according to the global `syntax`.
let tweets = rawTweets.map { rawTweet -> Tweet in
    // A row without these columns means the CSV is not a Twitter archive;
    // crash with a readable message instead of a bare force-unwrap trap.
    guard let rawText = rawTweet["text"], let rawTimestamp = rawTweet["timestamp"] else {
        fatalError("CSV row is missing the \"text\" or \"timestamp\" column: \(rawTweet)")
    }

    /// Formats a link for the configured `syntax`; `.none` yields the bare URL.
    func formattedLink(label: String, url: String) -> String {
        switch syntax {
        case .markdown: return "[\(label)](\(url))"
        case .html: return "<a href=\"\(url)\">\(label)</a>"
        case .none: return url
        }
    }

    var text = rawText
    if syntax == .markdown {
        // NOTE(review): CommonMark hard line breaks need two trailing spaces;
        // confirm whether the single space here is intended.
        text = text.replacingOccurrences(of: "\n", with: " \n")
    }

    // MARK: Links

    // An empty column splits into [""], which must not count as one URL.
    let expandedURLs = (rawTweet["expanded_urls"] ?? "")
        .components(separatedBy: ",")
        .filter { !$0.isEmpty }

    let textBeforeLinks = text as NSString
    let linkMatches = dataDetector.matches(
        in: text,
        options: [],
        range: NSRange(location: 0, length: textBeforeLinks.length)
    )

    // Pair matches with expanded URLs walking FORWARD through the text, so the
    // n-th shortened link receives the n-th expanded URL.
    // NOTE(review): assumes `expanded_urls` lists URLs in order of appearance
    // in the tweet — confirm against the archive format.
    var nonTcoURLs = 0
    var replacements: [(range: NSRange, link: String)] = []
    for (index, match) in linkMatches.enumerated() {
        let matched = textBeforeLinks.substring(with: match.range)
        let url: String
        if linkMatches.count > expandedURLs.count, !matched.hasPrefix("http") {
            // Detected by the data detector but not shortened by Twitter:
            // it has no entry in `expanded_urls`, so only add a scheme.
            url = "http://" + matched
            nonTcoURLs += 1
        } else if expandedURLs.indices.contains(index - nonTcoURLs) {
            url = expandedURLs[index - nonTcoURLs]
        } else {
            // No expanded URL available; keep the link exactly as written.
            url = matched
        }

        let urlName = url
            .replacingOccurrences(of: "http://", with: "")
            .replacingOccurrences(of: "https://", with: "")
        replacements.append((range: match.range, link: formattedLink(label: urlName, url: url)))
    }

    // Apply back-to-front so the ranges of earlier matches stay valid.
    for replacement in replacements.reversed() {
        text = (text as NSString).replacingCharacters(in: replacement.range, with: replacement.link)
    }

    // MARK: Handles

    let handleMatches = handleRegex.matches(
        in: text,
        options: [],
        range: NSRange(location: 0, length: text.utf16.count)
    )
    // Back-to-front again, for the same reason as above.
    for match in handleMatches.reversed() {
        let current = text as NSString
        // Skip the leading "@" to get the bare account name.
        let accountRange = NSRange(location: match.range.location + 1, length: match.range.length - 1)
        let account = current.substring(with: accountRange)
        let handleLink = formattedLink(label: "@\(account)", url: "https://twitter.com/\(account)")
        text = current.replacingCharacters(in: match.range, with: handleLink)
    }

    // MARK: Timestamp

    let compactTimestamp = rawTimestamp
        .replacingOccurrences(of: " ", with: "-")
        .replacingOccurrences(of: ":", with: "")
    // `prefix` does not trap when the timestamp is shorter than 15 characters.
    let time = String(compactTimestamp.prefix(15))

    return Tweet(text: text, timestamp: time)
}
// Blog post: https://rolandleth.com/extracting-and-parsing-tweets-from-your-twitter-archive
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment