-
-
Save extratone/b6754da920c447b71ffa34bc88b9bf07 to your computer and use it in GitHub Desktop.
Extracting and converting your Twitter archive into simpler objects
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
// Blog post: https://rolandleth.com/extracting-and-parsing-tweets-from-your-twitter-archive
import Foundation | |
/// The simplified representation of one archived tweet.
struct Tweet {
    /// Tweet body, after link and handle markup has been applied.
    let text: String
    /// Shortened form of the archive's `timestamp` column.
    let timestamp: String
}

// MARK: - Codable

extension Tweet: Codable {}
/// Markup flavor used when links and @handles are rewritten.
///
/// - `markdown`: links become `[label](url)`.
/// - `html`: links become `<a href="url">label</a>`.
/// - `none`: links are replaced by the bare URL.
enum Syntax {
    case markdown, html, none
}
// Markup flavor applied to every converted tweet.
let syntax = Syntax.markdown

// Finds link ranges inside a tweet's text.
let dataDetector = try! NSDataDetector(
    types: NSTextCheckingResult.CheckingType.link.rawValue
)

// Matches "@" followed by one or more characters that are neither
// punctuation, brackets, quotes nor a space.
let handleRegex = try! NSRegularExpression(
    pattern: "@[^.,:;?!'\"\\-()\\[\\]{} ]+",
    options: [.caseInsensitive]
)

// The archive's tweets.csv must be bundled; a missing or unreadable file
// is treated as a programmer error and traps here.
let file = Bundle.main.path(forResource: "tweets", ofType: "csv")!
let csv = try! CSV(name: file)
// Keeps only standalone tweets: rows that are not retweets/quotes, not
// replies, and not links to the author's own blog.
var rawTweets = csv.rows.filter { row in
    // A missing column behaves exactly like an empty one for every check below.
    let text = row["text"] ?? ""
    let expandedURLs = row["expanded_urls"] ?? ""
    let retweetedUserID = row["retweeted_status_user_id"] ?? ""
    let repliedStatusID = row["in_reply_to_status_id"] ?? ""

    let retweetHosts = ["https://twitter.com", "favd.net"]
    let retweetMarkers = ["via @", "RT @", "\"@", "“@"]

    // Retweets, quotes and shares of other people's tweets.
    if !retweetedUserID.isEmpty { return false }
    if retweetHosts.contains(where: { expandedURLs.contains($0) }) { return false }
    if retweetMarkers.contains(where: { text.contains($0) }) { return false }
    if text == "." { return false }

    // Replies, whether flagged by the archive or simply starting with a handle.
    if !repliedStatusID.isEmpty || text.hasPrefix("@") { return false }

    // Links back to the blog.
    return !expandedURLs.contains("rolandleth.com")
}
/// Converts every filtered CSV row into a `Tweet`.
///
/// For each row the text is rewritten so that:
/// - detected links point at their expanded URL (taken from `expanded_urls`,
///   in order of appearance) and are rendered in the configured `syntax`;
/// - `@handles` link to the matching twitter.com profile;
/// - the timestamp is compacted to at most 15 characters
///   (spaces become dashes, colons are dropped).
let tweets = rawTweets.map { rawTweet -> Tweet in
    /// Renders `label` as a link to `url` in the configured `syntax`.
    func link(_ label: String, to url: String) -> String {
        switch syntax {
        case .markdown: return "[\(label)](\(url))"
        case .html: return "<a href=\"\(url)\">\(label)</a>"
        case .none: return url
        }
    }

    var text = rawTweet["text"] ?? ""

    if syntax == .markdown {
        // A Markdown hard line break needs two trailing spaces before the newline.
        text = text.replacingOccurrences(of: "\n", with: "  \n")
    }

    // All ranges below are UTF-16 ranges into this snapshot of the text;
    // the text is only mutated once every replacement has been collected.
    let source = text as NSString
    let fullRange = NSRange(location: 0, length: source.length)

    // An empty column splits into [""], so drop empty entries.
    let expandedURLs = (rawTweet["expanded_urls"] ?? "")
        .components(separatedBy: ",")
        .filter { !$0.isEmpty }

    var edits: [(range: NSRange, replacement: String)] = []

    // Links: walk the matches in text order so the n-th shortened link is
    // paired with the n-th expanded URL.
    let linkMatches = dataDetector.matches(in: text, options: [], range: fullRange)
    let hasUnlistedLinks = linkMatches.count > expandedURLs.count
    var nextExpandedIndex = 0

    for match in linkMatches {
        let detected = source.substring(with: match.range)
        let url: String

        if hasUnlistedLinks, !detected.hasPrefix("http") {
            // A bare domain that has no entry in `expanded_urls`.
            url = "http://" + detected
        } else if nextExpandedIndex < expandedURLs.count {
            url = expandedURLs[nextExpandedIndex]
            nextExpandedIndex += 1
        } else {
            // No expanded URL left for this link: keep what was detected.
            url = detected
        }

        let label = url
            .replacingOccurrences(of: "http://", with: "")
            .replacingOccurrences(of: "https://", with: "")
        edits.append((range: match.range, replacement: link(label, to: url)))
    }

    // Handles: matched against the untouched text so that an "@" inside a
    // generated link is never rewritten; matches overlapping a detected link
    // are skipped for the same reason.
    for match in handleRegex.matches(in: text, options: [], range: fullRange) {
        let overlapsLink = linkMatches.contains(where: {
            NSIntersectionRange($0.range, match.range).length > 0
        })
        if overlapsLink { continue }

        // Drop the leading "@" to get the account name.
        let accountRange = NSRange(location: match.range.location + 1,
                                   length: match.range.length - 1)
        let account = source.substring(with: accountRange)
        edits.append((range: match.range,
                      replacement: link("@" + account, to: "https://twitter.com/\(account)")))
    }

    // Apply from the end of the text backwards so earlier ranges stay valid.
    let rewritten = NSMutableString(string: text)
    for edit in edits.sorted(by: { $0.range.location > $1.range.location }) {
        rewritten.replaceCharacters(in: edit.range, with: edit.replacement)
    }
    text = rewritten as String

    // `prefix` tolerates timestamps shorter than 15 characters.
    let compactTimestamp = (rawTweet["timestamp"] ?? "")
        .replacingOccurrences(of: " ", with: "-")
        .replacingOccurrences(of: ":", with: "")
    let time = String(compactTimestamp.prefix(15))

    return Tweet(text: text, timestamp: time)
}
// Blog post: https://rolandleth.com/extracting-and-parsing-tweets-from-your-twitter-archive
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment