-
-
Save karadza3a/3ad9b6343ba1b29bebd2bb95dd4270c1 to your computer and use it in GitHub Desktop.
Processing BSON Files (Kaggle cdiscount competition)
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import io
import multiprocessing as mp  # will come in handy due to the size of the data

import bson  # this is installed with the pymongo package
import matplotlib.pyplot as plt
import pandas as pd

from process import process
NCORE = 6
BSON_PATH = r'C:\nbs\cdiscount\data\train_example.bson'


def to_category_frame(mapping):
    """Turn a ``{product_id: category_id}`` mapping into a DataFrame.

    The result is indexed by product id (index named ``_id``) and has a
    single column named ``category_id``.
    """
    frame = pd.DataFrame.from_dict(mapping, orient='index')
    frame.index.name = '_id'
    # from_dict labels the single value column 0; give it a meaningful name.
    return frame.rename(columns={0: 'category_id'})


def build_prod_to_category(bson_path=BSON_PATH, ncore=NCORE):
    """Stream a BSON file through ``ncore`` workers and collect categories.

    Each decoded document is pushed onto a bounded queue that the worker
    processes drain by running ``process``. The workers record their results
    in a manager-backed dict, which is copied into a plain dict and returned
    as a DataFrame once every worker has finished.

    Args:
        bson_path: Path of the BSON file to read.
        ncore: Number of worker processes (also the queue capacity).

    Returns:
        DataFrame indexed by ``_id`` with one ``category_id`` column.
    """
    # The manager is used as a context manager so its server process is shut
    # down once the shared dict has been copied out.
    with mp.Manager() as manager:
        shared = manager.dict()  # note the difference: a proxy, not a dict
        q = mp.Queue(maxsize=ncore)
        iolock = mp.Lock()
        # ``process`` is passed as the pool initializer, so every worker runs
        # the consumer loop as soon as it starts; no tasks are submitted.
        pool = mp.Pool(ncore, initializer=process,
                       initargs=(q, iolock, shared))

        # process the file
        with open(bson_path, 'rb') as bson_file:
            for d in bson.decode_file_iter(bson_file):
                q.put(d)  # blocks until q below its max size

        # tell workers we're done: one sentinel per worker
        for _ in range(ncore):
            q.put(None)
        pool.close()
        pool.join()

        # convert back to normal dictionary while the manager is still alive
        mapping = dict(shared)

    return to_category_frame(mapping)


# Guard the entry point: with the spawn start method (the default on Windows)
# every worker re-imports this module, and unguarded top-level code would try
# to create the pool again inside each child.
if __name__ == '__main__':
    prod_to_category = build_prod_to_category()
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
from skimage.data import imread # or, whatever image library you prefer | |
import io | |
def process(q, iolock, prod_to_category):
    """Consume documents from ``q`` until a ``None`` sentinel arrives.

    For every document the product's category is recorded in
    ``prod_to_category`` and each of its pictures is decoded.

    Args:
        q: Queue yielding documents with ``_id``, ``category_id`` and
            ``imgs`` keys; ``None`` signals that there is no more work.
        iolock: Lock handed over by the caller; not used in this body.
        prod_to_category: Mapping updated in place with
            ``product_id -> category_id``.
    """
    while True:
        d = q.get()
        if d is None:
            # Sentinel: the producer has no more documents for this worker.
            break

        prod_to_category[d['_id']] = d['category_id']

        for pic in d['imgs']:
            # The image bytes are wrapped in a file-like object for imread.
            # NOTE(review): newer scikit-image releases expose imread under
            # skimage.io rather than skimage.data -- confirm against the
            # installed version.
            picture = imread(io.BytesIO(pic['picture']))
            # do something with the picture, etc
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment