Skip to content

Instantly share code, notes, and snippets.

@jkuruzovich
Last active October 27, 2020 16:28
Show Gist options
  • Save jkuruzovich/142e10b56e49f95f2b57acd1bd55b0b8 to your computer and use it in GitHub Desktop.
Save jkuruzovich/142e10b56e49f95f2b57acd1bd55b0b8 to your computer and use it in GitHub Desktop.
This will loop through a set of data files, creating samples of each file.
# Loop through a set of CSV data files and save the first N rows of each as sample files.
import os, sys, importlib, glob
import numpy as np
from pathlib import Path
import pandas as pd
sys.path.append(os.path.join(Path.cwd().parent, 'modules'))
import mongoparser as mp
importlib.reload(mp)
# Project layout, resolved relative to the directory the script is run from.
cwd_dir = Path.cwd()  # For running locally
base_dir = cwd_dir.parent
# Absolute location of the source CSV files.
data_dir = Path('/Volumes/fusion/data/crunchbase/2018/mongo_queries/production/csv/')
# Destination for the generated sample files: <base_dir>/data/sample/csv.
sample_dir = base_dir.joinpath('data', 'sample', 'csv')
def save_multiple_files(data_path, sample_path, dir='sample', extension='*.csv', samples=(1000, 10000)):
    """Write truncated copies of every matching CSV file in a directory.

    For each file in ``data_path`` matching ``extension``, the first
    ``max(samples)`` rows are read once, and one output CSV is written per
    entry in ``samples`` containing the first ``sample`` rows. Output files
    are named ``<name><sample>.csv``, where ``<name>`` is the source file's
    base name up to its first dot.

    Args:
        data_path: Directory containing the source files (``str`` or ``Path``).
        sample_path: Directory the sample files are written to (``str`` or
            ``Path``). Created, including parents, if it does not exist.
        dir: Unused by this function; kept so existing callers that pass it
            keep working.
        extension: Glob pattern, relative to ``data_path``, selecting the
            source files.
        samples: Row counts to save for each source file. If empty, the
            output directory is still created but no files are written.
    """
    # Accept plain strings as well as Path objects for both directories.
    data_path = Path(data_path)
    sample_path = Path(sample_path)
    sizes = list(samples)

    sample_path.mkdir(parents=True, exist_ok=True)
    if not sizes:
        # Nothing to write; avoids taking the max of an empty sequence.
        return

    # Read only as many rows as the largest requested sample needs.
    max_rows = int(max(sizes))

    # Sort so files are processed in a deterministic order.
    for source in sorted(glob.glob(str(data_path / extension))):
        # Path(...).name is separator-agnostic; keep the part before the first dot.
        stem = Path(source).name.split('.')[0]
        df = pd.read_csv(source, nrows=max_rows)
        for size in sizes:
            filename = stem + str(size) + '.csv'
            print("Saving: ", filename)
            df.iloc[0:size, ].to_csv(sample_path / filename, index=False)
# Write the sample files for every CSV in data_dir into sample_dir.
save_multiple_files(data_path=data_dir, sample_path=sample_dir)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment