@jobliz
Created August 10, 2018 02:06
Two-step processing for loading the goodbooks-10k dataset into Elasticsearch. The second script below builds new_.csv (book id, title, and pipe-delimited tags) from the goodbooks-10k CSVs, and the first script bulk-indexes that file, so the preprocessing step has to run before the loader.
import sys
import csv

from elasticsearch_dsl.connections import connections
from elasticsearch_dsl import DocType, Text, Date, Search
from elasticsearch import Elasticsearch

connections.create_connection(hosts=['localhost'], timeout=20)
es = Elasticsearch()
ess = Search(using=es)

ES_MEDIA_INDEX = 'babelcodex_test'
ES_MEDIA_TYPE = 'media'
ES_MEDIA_ID_FIELD = 'id'

# Build the alternating action/document pairs that the bulk API expects.
bulk_data = []
with open('new_.csv', newline='') as csvfile:
    reader = csv.reader(csvfile, delimiter=',')
    for item in reader:
        # new_.csv rows are: goodreads_book_id, title, pipe-delimited tags
        tag_names = item[2].split("|")
        data_dict = {
            'id': item[0],
            'title': item[1],
            'tags': tag_names
        }
        op_dict = {
            "index": {
                "_index": ES_MEDIA_INDEX,
                "_type": ES_MEDIA_TYPE,
                "_id": data_dict[ES_MEDIA_ID_FIELD]
            }
        }
        bulk_data.append(op_dict)
        bulk_data.append(data_dict)

# Single-node test settings: one shard, no replicas.
request_body = {
    "settings": {
        "number_of_shards": 1,
        "number_of_replicas": 0
    }
}
es.indices.create(index=ES_MEDIA_INDEX, body=request_body)

# refresh=True makes the documents searchable as soon as the call returns.
es.bulk(index=ES_MEDIA_INDEX, body=bulk_data, refresh=True)
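The Search object (ess) created above is never used, but the same elasticsearch_dsl API makes a quick sanity check that the bulk load worked. A minimal sketch, reusing es, Search and ES_MEDIA_INDEX from the script above; the "fantasy" tag is only an illustrative query value:

# Verification query against the freshly loaded index ("fantasy" is just an example tag).
s = Search(using=es, index=ES_MEDIA_INDEX).query("match", tags="fantasy")
response = s.execute()
print("hits:", response.hits.total)
for hit in response:
    print(hit.meta.id, hit.title, hit.tags)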
import os
import csv
import sys

import pandas as pd
from tqdm import tqdm

r = pd.read_csv('ratings.csv')
tr = pd.read_csv('to_read.csv')
b = pd.read_csv('books.csv')
t = pd.read_csv('tags.csv')
bt = pd.read_csv('book_tags.csv')

# Merge tag names into the tag applications.
bt = bt.merge(t, on='tag_id')

# Merge book titles in as well, for good measure.
bt = bt.merge(b[['goodreads_book_id', 'title']], on='goodreads_book_id')

# Fix negative tag counts.
bt.loc[bt['count'] < 0, 'count'] = 0

# Group every tag name under its goodreads_book_id.
print("Collecting tags from book_tags.csv")
book_tags = {}
with tqdm(total=len(bt)) as pbar:
    for index, row in bt.iterrows():
        if row['goodreads_book_id'] not in book_tags:
            book_tags[row['goodreads_book_id']] = []
        book_tags[row['goodreads_book_id']].append(row['tag_name'])
        pbar.update(1)

# Write one row per book: goodreads_book_id, title, pipe-delimited tag string.
print("Creating new CSV file")
with open('new_.csv', 'w', newline='') as csvfile:
    writer = csv.writer(csvfile, delimiter=',')
    with tqdm(total=len(b)) as pbar:
        for index, row in b.iterrows():
            # Books that never received a tag fall back to an empty list.
            tags = book_tags.get(row['goodreads_book_id'], [])
            tag_string = '|'.join(tags)
            writer.writerow([row['goodreads_book_id'], row['title'], tag_string])
            pbar.update(1)
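Because the Elasticsearch loader at the top of this gist reads new_.csv, this preprocessing script is the one to run first. A minimal sketch of a sanity check on the generated file, assuming it sits in the current working directory:

# Verify that every row of new_.csv has the three columns the loader expects.
import csv

rows = 0
with open('new_.csv', newline='') as csvfile:
    for row in csv.reader(csvfile):
        assert len(row) == 3, row
        rows += 1
print(rows, "rows ready for bulk indexing")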