Skip to content

Instantly share code, notes, and snippets.

Created November 6, 2017 12:20
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 1 You must be signed in to fork a gist
  • Save anonymous/e7d5873994a3c4bf2410268148c907ec to your computer and use it in GitHub Desktop.
Save anonymous/e7d5873994a3c4bf2410268148c907ec to your computer and use it in GitHub Desktop.
Processing BSON Files (Kaggle cdiscount competition)
import io
import multiprocessing as mp  # will come in handy due to the size of the data

import bson  # this is installed with the pymongo package
import matplotlib.pyplot as plt
import pandas as pd

from process import process
NCORE = 6  # number of worker processes; also the capacity of the work queue

# The guard keeps child processes from re-running this script when they
# import the main module, which multiprocessing requires under the "spawn"
# start method (the default on Windows).
if __name__ == '__main__':
    # Manager-backed dict proxy, so writes made in workers are visible here
    # (note the difference from a plain dict).
    prod_to_category = mp.Manager().dict()
    q = mp.Queue(maxsize=NCORE)
    iolock = mp.Lock()
    # `process` is passed as the pool initializer, so each worker calls it on
    # start-up with the shared queue, lock and dict.
    pool = mp.Pool(NCORE, initializer=process, initargs=(q, iolock, prod_to_category))

    try:
        # process the file; the context manager closes it even if decoding fails
        with open(r'C:\nbs\cdiscount\data\train_example.bson', 'rb') as bson_file:
            for d in bson.decode_file_iter(bson_file):
                q.put(d)  # blocks until q below its max size
    finally:
        # tell workers we're done: one None per worker, sent even when reading
        # raised, so that no worker is left waiting on the queue
        for _ in range(NCORE):
            q.put(None)
        pool.close()
        pool.join()

    # convert back to normal dictionary
    prod_to_category = dict(prod_to_category)
    # one row per product id, with a single column holding the category id
    prod_to_category = pd.DataFrame.from_dict(prod_to_category, orient='index')
    prod_to_category.index.name = '_id'
    prod_to_category.rename(columns={0: 'category_id'}, inplace=True)
from skimage.data import imread # or, whatever image library you prefer
import io
def process(q, iolock, prod_to_category):
    """Consume product documents from ``q`` until a ``None`` sentinel arrives.

    For each document, store ``d['_id'] -> d['category_id']`` in
    ``prod_to_category`` and decode every entry of ``d['imgs']`` with
    ``imread``.

    Args:
        q: Object whose ``get()`` returns product dicts, then ``None`` to
            signal that this consumer should stop.
        iolock: Accepted but not used by this function (presumably intended
            for serialising output across workers -- confirm with caller).
        prod_to_category: Mapping updated in place with one entry per
            document.

    Returns:
        None. Items queued after the first ``None`` are left untouched.
    """
    while True:
        d = q.get()
        if d is None:
            # sentinel: no more work for this consumer
            break
        product_id = d['_id']
        category_id = d['category_id']
        prod_to_category[product_id] = category_id
        for pic in d['imgs']:
            # the image bytes are wrapped in a file-like object for imread
            picture = imread(io.BytesIO(pic['picture']))
            # do something with the picture, etc
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment