Skip to content

Instantly share code, notes, and snippets.

@chenyanzhe
Last active January 1, 2016 08:59
Show Gist options
  • Save chenyanzhe/8121434 to your computer and use it in GitHub Desktop.
Save chenyanzhe/8121434 to your computer and use it in GitHub Desktop.
Split all files in the corresponding folder into 4 smaller parts accordingly by lines.
#!/usr/bin/env python3
# Usage:
# .
# ├── amazon
# │   ├── amazon_aa
# │   ├── amazon_ab
# │   ├── amazon_ac
# │   ...
# │   ├── amazon_bu
# │   └── amazon_bv
# ├── split.py
# ├── hollywood
# │   ├── hollywood_aa
# │   ├── hollywood_ab
# │   ├── hollywood_ac
# │   ...
# │   ├── hollywood_bu
# │   └── hollywood_bv
# └── wiki
#     ├── wiki_aa
#     ├── wiki_ab
#     ├── wiki_ac
#     ...
#     ├── wiki_bu
#     └── wiki_bv
#
# When this script is run, it splits every data file into 4 smaller ones,
# e.g. amazon_aa --> { amazon_aa_0, amazon_aa_26867, amazon_aa_53734, amazon_aa_80601 }
# The number appended at the end is the starting line number of the newly split file.
import os
def split_file(filepath, lines_per_file=100):
    """Split a text file into consecutive pieces of at most N lines each.

    The pieces are written next to the input file and are named
    ``<filename>_<start>``, where ``<start>`` is the zero-based index of the
    first line stored in that piece (``_0``, ``_<N>``, ``_<2N>``, ...).
    The input file itself is left untouched.  An empty input still produces
    an empty ``<filename>_0`` piece.

    Args:
        filepath: Path of the file to split.
        lines_per_file: Maximum number of lines written to each piece.
            A value of ``0`` raises ``ZeroDivisionError``.

    Raises:
        OSError: If the input cannot be read or a piece cannot be written.
    """
    directory, basename = os.path.split(filepath)

    def piece_path(start):
        # Output name carries the index of the first line it contains.
        return os.path.join(directory, '{}_{}'.format(basename, start))

    # newline='' disables newline translation on both sides, so every line is
    # copied with its original terminator ('\n', '\r\n' or '\r') and the
    # pieces concatenate back to the exact input on any platform.
    with open(filepath, 'r', newline='') as reader:
        # Opened *before* the try block: if this open fails there is nothing
        # to close, and the real error is not masked by a NameError.
        writer = open(piece_path(0), 'w', newline='')
        try:
            for i, line in enumerate(reader):
                # Roll over to a new piece at every multiple of
                # lines_per_file; index 0 is already covered by the piece
                # opened above, so it is not truncated and reopened.
                if i % lines_per_file == 0 and i > 0:
                    writer.close()
                    writer = open(piece_path(i), 'w', newline='')
                writer.write(line)
        finally:
            # Closing an already-closed file is a no-op, so this is safe even
            # when reopening the next piece failed inside the loop.
            writer.close()
if __name__ == '__main__':
    # Every sub-directory that sits next to this script is treated as a
    # dataset folder whose files are each split into roughly four pieces.
    base = os.path.dirname(os.path.abspath(__file__))
    for job in os.listdir(base):
        cwd = os.path.join(base, job)
        # Skip the script itself and any other plain file beside it;
        # only directories can be listed.
        if not os.path.isdir(cwd):
            continue
        # os.listdir returns a snapshot list, so the pieces created below
        # are not picked up and re-split during this run.
        for f in os.listdir(cwd):
            target = os.path.join(cwd, f)
            # Nested directories and other non-regular entries cannot be
            # opened as data files.
            if not os.path.isfile(target):
                continue
            # Count lines with a context manager so the handle is closed
            # before the file is opened again for splitting.
            with open(target) as handle:
                num_lines = sum(1 for _ in handle)
            # "+ 1" keeps the chunk size at least 1 (even for empty files)
            # and makes the last piece absorb the remainder.
            split_file(target, num_lines // 4 + 1)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment