Skip to content

Instantly share code, notes, and snippets.

@acheong08
Created September 29, 2023 20:24
Show Gist options
  • Save acheong08/894db3d4dffb7c577dd5773b9598ab95 to your computer and use it in GitHub Desktop.
Save acheong08/894db3d4dffb7c577dd5773b9598ab95 to your computer and use it in GitHub Desktop.
Convert Twitter Database Dump to SQLite
package main
import (
"bufio"
"database/sql"
"fmt"
"os"
"strings"
_ "github.com/mattn/go-sqlite3"
)
type UserData struct {
Email string
Name string
ScreenName string
Followers int
CreatedAt string
}
const batchSize = 1000
func main() {
// Read all files in the current directory
files, err := os.ReadDir(".")
if err != nil {
panic(err)
}
// Process each .txt file
for _, f := range files {
if strings.HasSuffix(f.Name(), ".txt") {
fmt.Printf("Processing file: %s\n", f.Name())
// Generate a database file name based on the text file name
dbFileName := fmt.Sprintf("%s.db", strings.TrimSuffix(f.Name(), ".txt"))
// Open SQLite database
db, err := sql.Open("sqlite3", dbFileName)
if err != nil {
panic(err)
}
// Create table if it doesn't exist
createTable(db)
// Open the file
file, err := os.Open(f.Name())
if err != nil {
fmt.Printf("Error opening file %s: %v\n", f.Name(), err)
continue
}
// Process the file and insert data into the database
processFile(db, file)
file.Close()
// Close the database
db.Close()
}
}
}
func createTable(db *sql.DB) {
query := `
CREATE TABLE IF NOT EXISTS users (
id INTEGER PRIMARY KEY AUTOINCREMENT,
email TEXT,
name TEXT,
screen_name TEXT,
followers INTEGER,
created_at TEXT
)
`
_, err := db.Exec(query)
if err != nil {
panic(err)
}
}
func processFile(db *sql.DB, file *os.File) {
scanner := bufio.NewScanner(file)
records := make([]UserData, 0)
for scanner.Scan() {
data := parseLine(scanner.Text())
records = append(records, data)
// If the batch size is reached, insert the records into the database
if len(records) == batchSize {
batchInsertIntoDatabase(db, records)
records = records[:0]
}
}
// Insert any remaining records
if len(records) > 0 {
batchInsertIntoDatabase(db, records)
}
}
func parseLine(line string) UserData {
parts := strings.Split(line, " - ")
var data UserData
for _, part := range parts {
kv := strings.SplitN(part, ": ", 2)
if len(kv) != 2 {
continue
}
key := kv[0]
value := kv[1]
switch key {
case "Email":
data.Email = value
case "Name":
data.Name = value
case "ScreenName":
data.ScreenName = value
case "Followers":
fmt.Sscanf(value, "%d", &data.Followers)
case "Created At":
data.CreatedAt = value
}
}
return data
}
func batchInsertIntoDatabase(db *sql.DB, records []UserData) {
tx, err := db.Begin()
if err != nil {
panic(err)
}
stmt, err := tx.Prepare("INSERT INTO users (email, name, screen_name, followers, created_at) VALUES (?, ?, ?, ?, ?)")
if err != nil {
panic(err)
}
defer stmt.Close()
for _, record := range records {
_, err := stmt.Exec(record.Email, record.Name, record.ScreenName, record.Followers, record.CreatedAt)
if err != nil {
fmt.Printf("Error inserting record: %v\n", err)
}
}
tx.Commit()
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment