Skip to content

Instantly share code, notes, and snippets.

View jamesthomson's full-sized avatar

James Thomson jamesthomson

View GitHub Profile
@jamesthomson
jamesthomson / python predict gender.py
Last active October 12, 2015 14:49
Process to predict gender based on Last.fm data
%matplotlib inline
import matplotlib
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
# Load the play-count sample: columns 0, 2 and 3 of the TSV, labelled
# user / artist / plays.
plays = pd.read_table(
    "usersha1-artmbid-artname-plays-sample.tsv",
    usecols=[0, 2, 3],
    names=['user', 'artist', 'plays'],
)
# Load the profile sample (columns 0 and 1, labelled user / gender) and
# discard every row that has a missing value.
users = pd.read_table(
    "usersha1-profile-sample.tsv",
    usecols=[0, 1],
    names=['user', 'gender'],
).dropna()
@jamesthomson
jamesthomson / pandas manip lastfm data.py
Last active August 29, 2015 14:22
Use pandas to manipulate Last.fm listening data into the format I need, ready for modelling with sklearn
#import data
import pandas as pd
# Read the tab-separated play-count sample, keeping columns 0, 2 and 3
# under the names user / artist / plays.
plays = pd.read_table(
    "usersha1-artmbid-artname-plays-sample.tsv",
    usecols=[0, 2, 3],
    names=['user', 'artist', 'plays'],
)
# Read the profile sample (columns 0 and 1 as user / gender) and drop the
# rows with a missing value, i.e. users whose gender is not known.
# To inspect the frames: print(plays.head()); print(users.head())
users = pd.read_table(
    "usersha1-profile-sample.tsv",
    usecols=[0, 1],
    names=['user', 'gender'],
).dropna()
#dummy code up gender
import numpy as np
# Load the iris dataset and take its data attribute as the feature matrix X
from sklearn.datasets import load_iris
iris = load_iris()
X = iris.data
# Call form of print: the bare `print X` statement is a SyntaxError on
# Python 3, while print(X) produces the same output on Python 2 and 3.
print(X)
# Scale the data: fit a StandardScaler on X and transform X in one step
from sklearn.preprocessing import StandardScaler
SS = StandardScaler()
XS = SS.fit_transform(X)
@jamesthomson
jamesthomson / import msd to dataframe.py
Created May 21, 2015 15:20
Importing a Million Song Dataset file and converting it to a DataFrame
import pandas as pd
# Open and split the tab-separated file, then convert it to a DataFrame.
# The with-block guarantees the file handle is closed once every line has
# been read (the original left it open).
# NOTE(review): strip() also removes leading/trailing tabs, so an empty
# first or last field on a line is dropped - confirm that is intended.
with open("P:\\A.tsv.a.txt", "r") as tsv_file:
    lines = [line.strip().split("\t") for line in tsv_file]
df = pd.DataFrame(lines)
# Pull out columns 18-21 and 33-41 for a further split.
# list() is needed because Python 3 range objects cannot be joined with +;
# on Python 2 it is a harmless copy of the list range() already returns.
cols = list(range(18, 22)) + list(range(33, 42))
# .loc slices by label with both endpoints included, so this takes the
# rows labelled 1 through 5.
arrays = df.loc[1:5, cols].values
# Import output1.txt (no header row), label its two columns length / freq,
# and echo the data frame as a check.
dist <- read.delim(
  "~/Documents/my blog/million song database/7plus songs/output1.txt",
  header = FALSE
)
names(dist) <- c("length", "freq")
dist
# Import output2.txt as comma-separated data, again without a header row.
dist_time <- read.csv(
  "~/Documents/my blog/million song database/7plus songs/output2.txt",
  header = FALSE
)
@jamesthomson
jamesthomson / msd_song_length_analysis.sql
Last active August 29, 2015 14:11
msd song length analysis
DROP TABLE IF EXISTS msd_data;
CREATE EXTERNAL TABLE msd_data
(
ref string,
analysis_sample_rate float ,
artist_7digitalid int ,
artist_familiarity float ,
@jamesthomson
jamesthomson / msd_files_list.R
Last active August 29, 2015 14:11
msd files list
library(XML)
# Read the response from the URL, one character element per line
search <- readLines("http://tbmmsd.s3.amazonaws.com/")
# Convert the XML text to a data.frame
df <- xmlToDataFrame(search)
# Pull out the Key column as the list of files
Files <- df$Key
# Keep only the entries that are not NA
Files2 <- Files[!is.na(Files)]
#server.r
library(shiny)
shinyServer(function(input, output) {
library(rjson)
library(ggplot2)
library(grid)
# ui.R
library(shiny)
shinyUI(fluidPage(
titlePanel("Lastfm Dashboard"),
sidebarLayout(position ="left",
sidebarPanel(
img(src = "logo.png", height = 80, width = 80),
library(nnet)
library(ggplot2)
neuralNetScoreTime<-function(var, score){
modelset<-NULL
for (i in 1:var) {
eval(parse(text=paste0("temp.", i, "<-rnorm(150, mean=1, sd=1)")))
eval(parse(text=paste0("modelset<-cbind(modelset, temp.", i, ")")))