Skip to content

Instantly share code, notes, and snippets.

View jamesthomson's full-sized avatar

James Thomson jamesthomson

View GitHub Profile
@jamesthomson
jamesthomson / lastfm_spark_rec_aws.py
Last active March 27, 2016 23:40
aws version of the lastfm recommendations in spark
#in terminal connect to the master node
ssh hadoop@ec2-xx-xx-xxx-xxx.compute-1.amazonaws.com -i ~/aws_key_pair.pem
#then fire up spark
MASTER=yarn-client /home/hadoop/spark/bin/pyspark
# Load the listens file from S3; each element is one line of text.
# NOTE(review): `sc` is not defined in this snippet -- presumably the SparkContext
# supplied by the pyspark shell started above; confirm.
lines = sc.textFile('s3n://jthomson/lastfm_listens/listens/usersha1-artmbid-artname-plays.tsv')
# Split every line on tabs into a list of fields.
data = lines.map(lambda l: l.split('\t'))
# Keep fields 0 and 2 and attach a constant 1 to every record.
# NOTE(review): going by the file name, field 0 looks like the user hash and
# field 2 the artist name -- verify against the data.
ratings = data.map(lambda d: (d[0], d[2], 1))
# Collect the distinct field-0 values and pair each one with a unique id.
users_lkp = ratings.map(lambda s: s[0]).distinct().zipWithUniqueId()
@jamesthomson
jamesthomson / python predict gender.py
Last active October 12, 2015 14:49
process to predict gender based on lastfm data
%matplotlib inline
import matplotlib
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
# Listening data: keep columns 0, 2 and 3 and name them user / artist / plays.
plays = pd.read_table(
    "usersha1-artmbid-artname-plays-sample.tsv",
    usecols=[0, 2, 3],
    names=['user', 'artist', 'plays'],
)
# Profile data: keep columns 0 and 1 as user / gender, then drop any row
# that contains a missing value.
users = pd.read_table(
    "usersha1-profile-sample.tsv",
    usecols=[0, 1],
    names=['user', 'gender'],
).dropna()
@jamesthomson
jamesthomson / pandas manip lastfm data.py
Last active August 29, 2015 14:22
use pandas to manipulate lastfm listening data into the format i need ready for modelling with sklearn
#import the lastfm sample files
import pandas as pd
#listening data: keep columns 0, 2 and 3, named user / artist / plays
plays = pd.read_table("usersha1-artmbid-artname-plays-sample.tsv", usecols=[0, 2, 3], names=['user', 'artist', 'plays'])
#profile data: keep columns 0 and 1, named user / gender
users = pd.read_table("usersha1-profile-sample.tsv", usecols=[0, 1], names=['user', 'gender'])
#print plays.head()
#print users.head()
#drop rows with a missing value (e.g. users with no gender recorded)
users=users.dropna()
#dummy code up gender
import numpy as np
from sklearn.datasets import load_iris
# Load the iris dataset bundled with scikit-learn and take its feature matrix.
iris = load_iris()
X = iris.data
# Use the function-call form of print: the bare `print X` statement is a
# SyntaxError on Python 3, while print(X) prints the same thing on Python 2.
print(X)
#scale the data
from sklearn.preprocessing import StandardScaler
SS=StandardScaler()
# Fit the scaler on X and get the transformed array back in one step.
XS=SS.fit_transform(X)
@jamesthomson
jamesthomson / import msd to dataframe.py
Created May 21, 2015 15:20
importing a million song dataset file and converting to a dataframe
import pandas as pd
#open and split file then convert to df
# A context manager closes the file handle as soon as the rows are read.
with open("P:\\A.tsv.a.txt", "r") as msd_file:
    # Strip only the line terminator: a bare strip() would also remove leading
    # or trailing tabs, dropping empty edge fields and shifting the positional
    # columns selected below.
    lines = [line.rstrip("\r\n").split("\t") for line in msd_file]
df=pd.DataFrame(lines)
#pull out columns for further split
# range objects cannot be joined with + on Python 3, so convert to lists first
# (this form is equally valid on Python 2).
cols=list(range(18,22))+list(range(33,42))
# .loc slicing is label-based and inclusive, so this takes rows labelled 1..5.
arrays=df.loc[1:5,cols].values
#import all data add column headers and run checks
#read.delim reads tab-separated input by default; header=FALSE treats the first line as data
dist <- read.delim("~/Documents/my blog/million song database/7plus songs/output1.txt", header=FALSE)
#label the columns
colnames(dist)<-c('length', 'freq')
#print the data frame for a visual check
dist
#read.csv reads comma-separated input by default; again no header row
dist_time <- read.csv("~/Documents/my blog/million song database/7plus songs/output2.txt", header=FALSE)
@jamesthomson
jamesthomson / msd_song_length_analysis.sql
Last active August 29, 2015 14:11
msd song length analysis
DROP TABLE IF EXISTS msd_data;
CREATE EXTERNAL TABLE msd_data
(
ref string,
analysis_sample_rate float ,
artist_7digitalid int ,
artist_familiarity float ,
@jamesthomson
jamesthomson / msd_files_list.R
Last active August 29, 2015 14:11
msd files list
#build a list of the file keys returned by the tbmmsd S3 endpoint
library(XML)
#read url
search<-readLines('http://tbmmsd.s3.amazonaws.com/')
#convert to data.frame
df<-xmlToDataFrame(search)
#pull out files list
Files<-df$Key
#clean up NAs
#NOTE(review): presumably rows that are not file entries have no Key and come back as NA -- confirm
Files2<-Files[!is.na(Files)]
#server.r
library(shiny)
shinyServer(function(input, output) {
library(rjson)
library(ggplot2)
library(grid)
# ui.R
library(shiny)
shinyUI(fluidPage(
titlePanel("Lastfm Dashboard"),
sidebarLayout(position ="left",
sidebarPanel(
img(src = "logo.png", height = 80, width = 80),