James Thomson jamesthomson

## audio_signal_processing.py

#required libraries
import urllib
import scipy.io.wavfile
import pydub

#a temp folder for downloads
#NOTE(review): hard-coded, machine-specific path; the trailing slash suggests file names are appended directly - confirm against the callers
temp_folder="/Users/home/Desktop/"

#spotify mp3 sample file

## twitter_following_cleanup.R
library(twitteR)

# Twitter API credentials (left blank here; fill in before running).
consumer_key <- ''
consumer_secret <- ''
access_token <- ''
access_secret <- ''

# Hand the four credentials to twitteR so later calls in this session are authorised.
setup_twitter_oauth(
  consumer_key,
  consumer_secret,
  access_token,
  access_secret
)

# Fetch the user object for the account of interest.
user <- getUser("inspirationinf")

## led zeppelin tracks.R
library(plotly)

# Load both inputs. Neither file has a header row, so read.csv names the
# columns V1, V2, ...; the first two columns of the output file are dropped
# and the remaining ones keep their V-names.
final <- read.csv("outputFile.txt", header = FALSE)[, -(1:2)]
original <- read.csv("zep_tracks.csv", header = FALSE)

# Inner join: keep rows where V5 of `original` equals V7 of `final`.
merged <- merge(original, final, by.x = "V5", by.y = "V7")

#grab key vars and fix labels

## zeppelin_audio_features.R
library(httr)

# Search parameters.
country = "GB"
albumType = "album"
artist="Led Zeppelin"

#get artist id
# URLencode(reserved = TRUE) percent-encodes every space and reserved character
# in the artist name. sub(" ", "%20", ...) only replaced the first space, so
# names with three or more words (or containing "&") produced a broken query.
url <- paste0("https://api.spotify.com/v1/search?q=", URLencode(artist, reserved = TRUE), "&type=artist")
# NOTE(review): the Spotify Web API may require an Authorization header - confirm.
search <- content(GET(url))
# Show the first artist entry returned by the search.
search$artists$items[[1]]

## zeppelin_clustering.py

%pyspark
import csv

# Load the track file as an RDD of raw text lines.
data = sc.textFile('s3://bucket/clustering/zep_tracks.csv')
# Run each partition's lines through csv.reader so every element becomes a list of fields.
rdd = data.mapPartitions(lambda partition: csv.reader(partition))
#convert to dataframe
dataframe = rdd.toDF(['artist','artist_id','album','album_id','track','track_id','track_number','track_length',
	'preview_url','danceability','energy','key','loudness','mode','speechiness','acousticness','instrumentalness',

## word2vec tweets example.py
import pandas as pd
import re
import numpy as np
import nltk
import gensim


#import data. contains identifier and tweet
#DataFrame.from_csv was deprecated and later removed from pandas; read_csv is its replacement.
#index_col=False keeps the first column as data rather than using it as the index.
tweets=pd.read_csv('tweets.txt', sep='\t', index_col=False)

## word2vec example.py
import nltk
import gensim


sample="""Renewed fighting has broken out in South Sudan between forces loyal to the president and vice-president. A reporter in the capital, Juba, told the BBC gunfire and large explosions could be heard all over the city; he said heavy artillery was being used. More than 200 people are reported to have died in clashes since Friday. The latest violence came hours after the UN Security Council called on the warring factions to immediately stop the fighting. In a unanimous statement, the council condemned the violence "in the strongest terms" and expressed "particular shock and outrage" at attacks on UN sites. It also called for additional peacekeepers to be sent to South Sudan.
Chinese media say two Chinese UN peacekeepers have now died in Juba. Several other peacekeepers have been injured, as well as a number of civilians who have been caught in crossfire. The latest round of violence erupted when troops loyal to President Salva Kiir and first Vice-President Riek Machar began sho

## entity_recognition_example.py
import nltk

#with open('sample.txt', 'r') as f:
#    sample = f.read()

#article taken from the bbc
sample="""Renewed fighting has broken out in South Sudan between forces loyal to the president and vice-president. A reporter in the capital, Juba, told the BBC gunfire and large explosions could be heard all over the city; he said heavy artillery was being used. More than 200 people are reported to have died in clashes since Friday. The latest violence came hours after the UN Security Council called on the warring factions to immediately stop the fighting. In a unanimous statement, the council condemned the violence "in the strongest terms" and expressed "particular shock and outrage" at attacks on UN sites. It also called for additional peacekeepers to be sent to South Sudan.
Chinese media say two Chinese UN peacekeepers have now died in Juba. Several other peacekeepers have been injured, as well as a number of civilians who have been caught in crossfire. The latest round of violence erupted when troops loy

## lastfm_spark_rec_local.py
#start a terminal at the folder where spark is installed
#in the command line run this to fire up a pyspark instance
./bin/pyspark

###########################
### LOADING IN THE DATA ###
###########################

#load in the file and examine
#sc is expected to be the SparkContext provided by the pyspark shell started above; textFile yields one element per line of the file
lines = sc.textFile('usersha1-artmbid-artname-plays.tsv')

## lastfm_spark_rec_aws.py
#in terminal connect to the master node
ssh hadoop@ec2-xx-xx-xxx-xxx.compute-1.amazonaws.com -i ~/aws_key_pair.pem
#then fire up spark
MASTER=yarn-client /home/hadoop/spark/bin/pyspark


# Raw listening log from S3, one tab-separated record per line.
lines = sc.textFile('s3n://jthomson/lastfm_listens/listens/usersha1-artmbid-artname-plays.tsv')
# Split each record into its fields.
data = lines.map(lambda line: line.split('\t'))
# Keep fields 0 and 2 and attach a constant rating of 1
# (judging by the file name these are the user hash and artist name - confirm).
ratings = data.map(lambda fields: (fields[0], fields[2], 1))
# Pair every distinct value of field 0 with a unique numeric id.
users_lkp = (
    ratings.map(lambda rating: rating[0])
    .distinct()
    .zipWithUniqueId()
)

	#required libraries
	import urllib
	import scipy.io.wavfile
	import pydub

	#a temp folder for downloads
	#NOTE(review): hard-coded, machine-specific path; the trailing slash suggests file names are appended directly - confirm against the callers
	temp_folder="/Users/home/Desktop/"

	#spotify mp3 sample file
	library(twitteR)

	# Twitter API credentials (left blank here; fill in before running).
	consumer_key <- ''
	consumer_secret <- ''
	access_token <- ''
	access_secret <- ''

	# Hand the four credentials to twitteR so later calls in this session are authorised.
	setup_twitter_oauth(
		consumer_key,
		consumer_secret,
		access_token,
		access_secret
	)

	# Fetch the user object for the account of interest.
	user <- getUser("inspirationinf")
	library(plotly)

	# Load both inputs. Neither file has a header row, so read.csv names the
	# columns V1, V2, ...; the first two columns of the output file are dropped
	# and the remaining ones keep their V-names.
	final <- read.csv("outputFile.txt", header = FALSE)[, -(1:2)]
	original <- read.csv("zep_tracks.csv", header = FALSE)

	# Inner join: keep rows where V5 of `original` equals V7 of `final`.
	merged <- merge(original, final, by.x = "V5", by.y = "V7")

	#grab key vars and fix labels
	library(httr)

	# Search parameters.
	country = "GB"
	albumType = "album"
	artist="Led Zeppelin"

	#get artist id
	# URLencode(reserved = TRUE) percent-encodes every space and reserved character
	# in the artist name. sub(" ", "%20", ...) only replaced the first space, so
	# names with three or more words (or containing "&") produced a broken query.
	url <- paste0("https://api.spotify.com/v1/search?q=", URLencode(artist, reserved = TRUE), "&type=artist")
	# NOTE(review): the Spotify Web API may require an Authorization header - confirm.
	search <- content(GET(url))
	# Show the first artist entry returned by the search.
	search$artists$items[[1]]

	%pyspark
	import csv

	# Load the track file as an RDD of raw text lines.
	data = sc.textFile('s3://bucket/clustering/zep_tracks.csv')
	# Run each partition's lines through csv.reader so every element becomes a list of fields.
	rdd = data.mapPartitions(lambda partition: csv.reader(partition))
	#convert to dataframe
	dataframe = rdd.toDF(['artist','artist_id','album','album_id','track','track_id','track_number','track_length',
	'preview_url','danceability','energy','key','loudness','mode','speechiness','acousticness','instrumentalness',
	import pandas as pd
	import re
	import numpy as np
	import nltk
	import gensim


	#import data. contains identifier and tweet
	#DataFrame.from_csv was deprecated and later removed from pandas; read_csv is its replacement.
	#index_col=False keeps the first column as data rather than using it as the index.
	tweets=pd.read_csv('tweets.txt', sep='\t', index_col=False)
	import nltk
	import gensim


	sample="""Renewed fighting has broken out in South Sudan between forces loyal to the president and vice-president. A reporter in the capital, Juba, told the BBC gunfire and large explosions could be heard all over the city; he said heavy artillery was being used. More than 200 people are reported to have died in clashes since Friday. The latest violence came hours after the UN Security Council called on the warring factions to immediately stop the fighting. In a unanimous statement, the council condemned the violence "in the strongest terms" and expressed "particular shock and outrage" at attacks on UN sites. It also called for additional peacekeepers to be sent to South Sudan.
	Chinese media say two Chinese UN peacekeepers have now died in Juba. Several other peacekeepers have been injured, as well as a number of civilians who have been caught in crossfire. The latest round of violence erupted when troops loyal to President Salva Kiir and first Vice-President Riek Machar began sho
	import nltk

	#with open('sample.txt', 'r') as f:
	# sample = f.read()

	#article taken from the bbc
	sample="""Renewed fighting has broken out in South Sudan between forces loyal to the president and vice-president. A reporter in the capital, Juba, told the BBC gunfire and large explosions could be heard all over the city; he said heavy artillery was being used. More than 200 people are reported to have died in clashes since Friday. The latest violence came hours after the UN Security Council called on the warring factions to immediately stop the fighting. In a unanimous statement, the council condemned the violence "in the strongest terms" and expressed "particular shock and outrage" at attacks on UN sites. It also called for additional peacekeepers to be sent to South Sudan.
	Chinese media say two Chinese UN peacekeepers have now died in Juba. Several other peacekeepers have been injured, as well as a number of civilians who have been caught in crossfire. The latest round of violence erupted when troops loy
	#start a terminal at the folder where spark is installed
	#in the command line run this to fire up a pyspark instance
	./bin/pyspark

	###########################
	### LOADING IN THE DATA ###
	###########################

	#load in the file and examine
	#sc is expected to be the SparkContext provided by the pyspark shell started above; textFile yields one element per line of the file
	lines = sc.textFile('usersha1-artmbid-artname-plays.tsv')
	#in terminal connect to the master node
	ssh hadoop@ec2-xx-xx-xxx-xxx.compute-1.amazonaws.com -i ~/aws_key_pair.pem
	#then fire up spark
	MASTER=yarn-client /home/hadoop/spark/bin/pyspark


	# Raw listening log from S3, one tab-separated record per line.
	lines = sc.textFile('s3n://jthomson/lastfm_listens/listens/usersha1-artmbid-artname-plays.tsv')
	# Split each record into its fields.
	data = lines.map(lambda line: line.split('\t'))
	# Keep fields 0 and 2 and attach a constant rating of 1
	# (judging by the file name these are the user hash and artist name - confirm).
	ratings = data.map(lambda fields: (fields[0], fields[2], 1))
	# Pair every distinct value of field 0 with a unique numeric id.
	users_lkp = (
		ratings.map(lambda rating: rating[0])
		.distinct()
		.zipWithUniqueId()
	)