Skip to content

Instantly share code, notes, and snippets.

@andrewgdunn
Created December 7, 2014 16:57
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save andrewgdunn/f9bf8aa7935ed82ac3a3 to your computer and use it in GitHub Desktop.
Save andrewgdunn/f9bf8aa7935ed82ac3a3 to your computer and use it in GitHub Desktop.
import pandas as pd
# Columns of the source sheet that the recommendation step never uses.
UNUSED_COLUMNS = ['Timestamp', 'Link To reddit Review', 'Region', 'Price', 'Date']

# Single-word replacements for the sheet's verbose column headers.
COLUMN_NAMES = {
    'Whisky Name': 'whisky',
    'Reviewer Username': 'user',
    'Rating': 'rating',
}


def _lookup_table(names):
    """Return an ``index``/``name`` frame pairing each integer code with its name."""
    codes = list(range(len(names)))
    # An explicit column *list* fixes the column order; a set would leave it arbitrary.
    # Row labels start at 1 so the written files keep the layout of earlier runs.
    return pd.DataFrame(
        {'index': codes, 'name': list(names)},
        columns=['index', 'name'],
        index=pd.RangeIndex(1, len(codes) + 1),
    )


def encode_reviews(raw_df):
    """Replace whisky and user names with integer codes.

    The input frame is not modified. Unused columns are dropped, the remaining
    ones are renamed to ``whisky``/``user``/``rating``, and rows without a
    rating are discarded. Codes are handed out in order of first appearance,
    starting at 0, independently for each of the two columns.

    Args:
        raw_df: Frame holding the columns listed in ``UNUSED_COLUMNS`` and the
            keys of ``COLUMN_NAMES``.

    Returns:
        A ``(reviews, whisky_df, user_df)`` tuple: the encoded reviews and the
        code-to-name lookup tables for whiskies and for users.

    Raises:
        KeyError: If an expected column is missing from ``raw_df``.
    """
    reviews = (
        raw_df
        .drop(UNUSED_COLUMNS, axis=1)
        .rename(columns=COLUMN_NAMES)
        .dropna(subset=['rating'])
    )

    # Encode each column on its own. A frame-wide replace would also rewrite a
    # username that happens to equal a whisky name (and any other matching cell).
    # NOTE: factorize gives missing names the code -1 and leaves them out of
    # the lookup table.
    whisky_codes, whisky_names = pd.factorize(reviews['whisky'])
    user_codes, user_names = pd.factorize(reviews['user'])
    reviews = reviews.assign(whisky=whisky_codes, user=user_codes)

    return reviews, _lookup_table(whisky_names), _lookup_table(user_names)


def main():
    """Read the raw review sheet and write the encoded reviews and both lookup tables."""
    raw_df = pd.read_csv('reddit.r.scotch.tsv', sep='\t')
    reviews, whisky_df, user_df = encode_reviews(raw_df)
    reviews.to_csv('raw.tsv', sep='\t')
    whisky_df.to_csv('whisky_names.tsv', sep='\t')
    user_df.to_csv('user_names.tsv', sep='\t')


if __name__ == '__main__':
    main()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment