Skip to content

Instantly share code, notes, and snippets.

@pkrnjevic
Last active March 10, 2016 03:20
Show Gist options
  • Save pkrnjevic/6e98bc711f0f213c59eb to your computer and use it in GitHub Desktop.
Save pkrnjevic/6e98bc711f0f213c59eb to your computer and use it in GitHub Desktop.
One way to concatenate a lot of files quickly (352k in 2m48sec) ... inspired by http://randyzwitch.com/gnu-parallel-medium-data/
// catfile.go
// one possible way to concatenate files quickly
// inspired by http://randyzwitch.com/gnu-parallel-medium-data/
// runtime on mb pro15 (late 2013): 352k files concatenated in 2m48sec
//
package main
import (
"fmt"
"io/ioutil"
"os"
"path/filepath"
)
// walk returns a filepath.WalkFunc that appends the contents of every
// regular file it visits to out, printing a running "count<TAB>path"
// progress line (carriage-return overwritten) as it goes.
//
// Errors are returned to filepath.Walk rather than panicking, which lets
// the caller decide how to handle them.
func walk(out *os.File) filepath.WalkFunc {
	count := 0
	return func(path string, info os.FileInfo, err error) error {
		// A non-nil err means Walk could not stat/read this entry; in
		// that case info may be nil, so bail out before touching it.
		if err != nil {
			return err
		}
		count++
		fmt.Printf("%d\t%s\r", count, path)
		if info.IsDir() {
			return nil
		}
		dat, err := ioutil.ReadFile(path)
		if err != nil {
			return err
		}
		// Append the whole file to the output; any write error aborts
		// the walk via filepath.Walk's error propagation.
		if _, err := out.Write(dat); err != nil {
			return err
		}
		return nil
	}
}
// main concatenates every file under transactions/ into output.txt,
// then prints "done".
func main() {
	out, err := os.Create("output.txt")
	if err != nil {
		panic(err)
	}
	// Register the close only after we know Create succeeded; deferring
	// on a nil *os.File (the failure case) would be meaningless.
	defer out.Close()

	// Note: walkFn keeps a distinct name so it does not shadow the
	// package-level walk function.
	walkFn := walk(out)
	if err := filepath.Walk("transactions/", walkFn); err != nil {
		panic(err)
	}
	fmt.Printf("\ndone\n")
}
# code from http://randyzwitch.com/gnu-parallel-medium-data/
# used to generate test data files
# Python code
import random, csv
from faker import Faker
fake = Faker()
from pandas import DataFrame
import pandas as pd

# Create customer file of 1,234,567 customers with fake data.
# Use dataframe index as a way to generate unique customer id.
customers = [fake.simple_profile() for x in range(0, 1234567)]
customer_df = pd.DataFrame(customers)
customer_df["cust_id"] = customer_df.index

# Read in transactions file from arules package.
# with open("grocerydata.txt") as f:
with open("groceries.txt") as f:
    transactions = f.readlines()

# Remove new line character (strips the last character of each line).
transactions = [x[0:-1] for x in transactions]

# Generate transactions by cust_id.
# file format:
#   cust_id::int
#   store_id::int
#   transaction_datetime::string/datetime
#   items::string

# for each customer...
for i in range(0, 1234567):
    # ...create a file...
    # with open('./transactions/custfile_%s' % i, 'w') as csvfile:
    with open('transactions/custfile_%s' % i, 'w') as csvfile:
        trans = csv.writer(csvfile, delimiter=' ', quotechar='"', quoting=csv.QUOTE_MINIMAL)
        # ...that contains all of the transactions they've ever made.
        # NOTE(review): a fake zipcode stands in for store_id here — the
        # "file format" above says store_id::int; confirm that is intended.
        # range(1, randint(1, 365)) can be empty, so some customers may
        # legitimately get a zero-transaction file.
        for j in range(1, random.randint(1, 365)):
            trans.writerow([i, fake.zipcode(), fake.date_time_this_decade(before_now=True, after_now=False), transactions[random.randint(0, len(transactions) - 1)]])
# code from http://randyzwitch.com/gnu-parallel-medium-data/
# used to generate test data files
# R code: dump the arules Groceries sample data set to a plain text
# file (one comma-separated basket per line) for the Python script above.
library(arules)
data("Groceries")
write(Groceries, "groceries.txt", sep = ",")
@pkrnjevic
Copy link
Author

One possible way to concatenate files quickly.
Inspired by Randy Zwitch's article A Million Text Files And A Single Laptop.
Runtime on (late 2013) mbpro15: 352k files concatenated in 2m48sec.

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment