Skip to content

Instantly share code, notes, and snippets.

@NickTalavera NickTalavera/Combining the Data.R Secret

Last active Nov 24, 2016
Embed
What would you like to do?
Xbox 360 Backwards Compatibility Predictor
# Merge two game tables on a normalised ("generous") version of gameName.
#
# For each input a NameModded key is derived from gameName: lower-cased, the
# first run of digits replaced by its roman numeral, and every pattern in
# removeWords deleted. The two tables are then outer-merged on NameModded and
# the result is passed through gameRemover().
#
# Args:
#   dataX, dataY: data frames that each have a gameName column.
#   mergeType, keepName: not referenced in the visible part of this function.
#     NOTE(review): possibly consumed by the elided `...` sections - confirm.
#
# Returns:
#   Whatever gameRemover() returns for the merged table.
generousNameMerger = function(dataX,dataY,mergeType="all",keepName = "x") {
  dataList = list(dataX, dataY)
  datasWNameModded = foreach(i=seq_along(dataList)) %dopar% {
    datasOut = dataList[[i]]
    datasOut$gameName = as.character(datasOut$gameName)
    datasOut$NameModded = tolower(datasOut$gameName)
    # Turn the first number in each name into a roman numeral so that
    # "game 2" and "game ii" end up with the same key.
    lastWords = as.integer(stringr::str_trim(stringr::str_extract(datasOut$NameModded,pattern="[0-9]+")))
    lastWords = as.character(as.roman(lastWords))
    hasNumeral = !is.na(lastWords)
    datasOut$NameModded[hasNumeral] = stringr::str_replace(datasOut$NameModded[hasNumeral], replacement = lastWords[hasNumeral], pattern = "[0-9]+")
    # Each entry is used as a regular expression, in this order. The first
    # entry already deletes every non-alphanumeric character, so later entries
    # are matched against names without spaces or punctuation.
    removeWords = tolower(c("[^a-zA-Z0-9]"," ","Remastered","Videogame","WWE","EA*SPORTS","Soccer","&","","®","DVD$","of","DX","disney","Deluxe","Complete","Ultimate","Encore","definitive","for","edition","standard","special","game", "the","Gold","Legendary\\S","Base*Game","free*to*play","full*game", "year","hd","movie","TM","Cabela\"s","and"," x$","s$"))
    # A separate loop variable keeps the foreach index `i` intact.
    for (word in removeWords) {
      datasOut$NameModded = gsub(word, "", datasOut$NameModded, ignore.case = TRUE)
    }
    # Names that were stripped down to nothing fall back to the original
    # gameName. Both sides are indexed with the same mask so every row gets
    # its own name back rather than values recycled from the top of the column.
    strippedEmpty = datasOut$NameModded == "" & !is.na(datasOut$gameName)
    datasOut$NameModded[strippedEmpty] = datasOut$gameName[strippedEmpty]
    datasOut
  }
  dataX = datasWNameModded[[1]]
  dataY = datasWNameModded[[2]]
  ...
  data = merge(x = dataX, y = dataY, by = "NameModded", all = TRUE)
  ...
  data = gameRemover(data)
  return (data)
}
# Tidy the scraped UserVoice table.
#
# Rows whose in_progress value is 'In-Progress' get that value replaced by
# TRUE, rows whose value is 'Closed' get userVoiceClosed set to TRUE, and every
# row is flagged with isOnUserVoice = TRUE. Duplicate rows are dropped.
#
# Args:
#   data: data frame with an in_progress column.
#
# Returns:
#   The de-duplicated data frame with the flag columns applied.
fixUserVoice = function(data) {
  flaggedInProgress = data$in_progress == 'In-Progress'
  data$in_progress[flaggedInProgress] = TRUE

  # Evaluated after the update above, exactly as the statements are ordered.
  flaggedClosed = data$in_progress == 'Closed'
  data$userVoiceClosed[flaggedClosed] = TRUE

  data$isOnUserVoice = TRUE
  unique(data)
}
# Clean the table scraped from the Xbox 360 marketplace site.
#
# Steps, in order: empty strings become NA; rows without an ESRB rating, with
# a 'RP (Rating Pending)' rating, or with zero reviews are removed; genre is
# trimmed; numberOfReviews loses its thousands separators and becomes numeric;
# releaseDate is re-formatted from month/day/year; availability flags are set;
# DLdemos, features and gameCount are dropped; duplicates are removed.
#
# Args:
#   data: data frame with at least ESRBRating, numberOfReviews, genre,
#     releaseDate, DLdemos, gameCount and features columns.
#
# Returns:
#   The cleaned, de-duplicated data frame.
fixXbox360_MS_Site = function(data) {
  data[data == ""] = NA #Turn empty quotes into a proper missing value
  wasReleased = !is.na(data$ESRBRating) &
    tolower(data$ESRBRating) != tolower('RP (Rating Pending)') &
    data$numberOfReviews != 0
  # The test is NA when numberOfReviews is missing, and an NA in a logical row
  # index yields a row made entirely of NA. which() keeps only the rows whose
  # test is TRUE, so no such placeholder rows are created.
  data = data[which(wasReleased),] #Remove games that were never released
  data$genre = gsub(".*Other,|\\,.*","",data$genre, ignore.case = TRUE) #Remove "Other" if the genre list is longer
  data$numberOfReviews = as.numeric(gsub(pattern = ",", replacement = "", x = data$numberOfReviews,ignore.case = TRUE)) #Strip commas from reviews to make the number numeric
  data$releaseDate = as.character(as.Date(data$releaseDate, format = "%m/%d/%Y")) #Convert the data to be readable by R
  data$hasDemoAvailable[data$DLdemos>0] = TRUE #If there are demos, mark hasDemoAvailable to be true
  data$isAvailableToPurchaseDigitally[data$gameCount >= 1] = TRUE #If a game was found for sale, mark as available to download
  data$isListedOnMSSite = TRUE #Mark as available on Microsoft's site
  data = dplyr::select(data, -DLdemos, -features, -gameCount) #Toss unneeded variables
  unique(data)
}
#===============================================================================
# LIBRARIES #
#===============================================================================
library(stringr)
library(Hmisc)
library(stringi)
library(dplyr)
library(DataCombine)
library(data.table)
library(randomForest)
#===============================================================================
# SETUP PARALLEL #
#===============================================================================
library(foreach)
library(parallel)
library(doParallel)
# Ask for the number of logical cores on this machine.
cores.Number = detectCores(all.tests = FALSE, logical = TRUE)
# Start a cluster with a fixed size of 2 workers (not cores.Number).
cl <- makeCluster(2)
# Register the cluster as the backend used by foreach's %dopar%.
# NOTE(review): `cores` is passed together with an explicit cluster object;
# it looks like doParallel only uses `cores` when no cluster is given - confirm.
registerDoParallel(cl, cores=cores.Number)
class Major_Nelson_Blog_BC_List_Spider(scrapy.Spider):
    """Collect game titles from the Major Nelson backward-compatibility table.

    Each table row with a linked title in its first cell yields one
    MajorNelsonItem whose 'BCCompatible' field is the string 'TRUE'.
    """
    name = "Major_Nelson_Blog_BC_List"
    allowed_domains = ["majornelson.com"]
    start_urls = (
        'https://majornelson.com/blog/xbox-one-backward-compatibility/',
    )

    def parse(self, response):
        """Yield one item per table row that carries a linked game title."""
        table_rows = response.xpath('//*[@id="post-20954"]/div/div/table/tbody/tr')
        for row in table_rows:
            entry = MajorNelsonItem()
            titles = row.xpath('td[1]/a/text()').extract()
            if not titles:
                # No linked title in the first cell: nothing to record.
                continue
            entry['gameName'] = titles[0].strip()
            entry['BCCompatible'] = 'TRUE'
            yield entry
class MetacriticXbox360(scrapy.Spider):
    """Scrape game name, critic score and user score from Metacritic's
    Xbox 360 score listing.

    parse() reads the page count from the first listing page and requests
    pages 0 .. count-1; metacriticX360Find() turns each listing row into a
    MetacriticXbox360Item.
    """
    name = "MetacriticXbox360"
    allowed_domains = ["metacritic.com"]
    start_urls = (
        'http://www.metacritic.com/browse/games/score/metascore/all/xbox360/all?hardware=all&page=0',
    )

    @staticmethod
    def _first_text(selection):
        """Return the first extracted string of `selection`, stripped, or None
        when the selection matched nothing."""
        values = selection.extract()
        if len(values) == 0:
            return None
        return values[0].strip()

    def parse(self, response):
        """Request every listing page, based on the page count in the pager."""
        numberOfPages = int(response.xpath('//*[@id="main"]/div[1]/div[2]/div/div[2]/ul/li[10]/a/text()').extract()[0])
        for j in range(0, numberOfPages):
            next_page = 'http://www.metacritic.com/browse/games/score/metascore/all/xbox360/all?hardware=all&page=' + str(j)
            yield scrapy.Request(next_page, callback=self.metacriticX360Find)

    def metacriticX360Find(self, response):
        """Yield one item per listing row that has a game title.

        Rows without a title are skipped and missing scores are stored as ''
        so that one incomplete row no longer raises IndexError and discards
        the rest of the page.
        """
        rows_in_big_table = response.xpath('//*[@id="main"]/div[1]/div[1]/div[2]/div[3]/div/div/div')
        print(rows_in_big_table.extract())
        for onerow in rows_in_big_table:
            gameName = self._first_text(onerow.xpath('div[3]/a/text()'))
            if gameName is None:
                continue
            reviewScorePro = self._first_text(onerow.xpath('div[2]/div/text()')) or ''
            reviewScoreUser = self._first_text(onerow.xpath('div[4]/span[2]/text()')) or ''
            # 'tbd' is stored as an empty user score.
            if reviewScoreUser == 'tbd':
                reviewScoreUser = ''
            metacriticGameItem = MetacriticXbox360Item()
            metacriticGameItem['gameName'] = gameName
            metacriticGameItem['reviewScorePro'] = reviewScorePro
            metacriticGameItem['reviewScoreUser'] = reviewScoreUser
            yield metacriticGameItem
class Remasters(scrapy.Spider):
    """Scrape the titles listed in Game Informer's remaster/remake article.

    parse() reads the page count from the first article page and requests
    every page; remasterFind() yields one RemastersItem per bolded title.
    """
    name = "Remasters"
    allowed_domains = ["gameinformer.com"]
    start_urls = (
        'http://www.gameinformer.com/themes/blogs/generic/post.aspx?WeblogApp=features&y=2016&m=05&d=16&WeblogPostName=definitive-evolving-list-new-gen-remaster-hd-remake-&PostPageIndex=1',
    )

    def parse(self, response):
        """Request article pages 1 .. numberOfPages."""
        # print() with a single argument behaves the same on Python 2 and 3;
        # the bare `print x` statement form is a syntax error on Python 3.
        print("=" * 50)
        numberOfPages = int(response.xpath('//*[@id="divRenderBody"]/div/@rel').extract()[0])
        print("=" * 50)
        for j in range(1, numberOfPages + 1):
            next_page = 'http://www.gameinformer.com/themes/blogs/generic/post.aspx?WeblogApp=features&y=2016&m=05&d=16&WeblogPostName=definitive-evolving-list-new-gen-remaster-hd-remake-&PostPageIndex=' + str(j)
            print("Page" + str(j))
            yield scrapy.Request(next_page, callback=self.remasterFind)

    def remasterFind(self, response):
        """Yield one item per bolded paragraph text on the article page."""
        gameNames = response.xpath('//*[@id="divRenderBody"]/div[1]/div/p/strong/text()').extract()
        print(range(0, len(gameNames)))
        for gameName in gameNames:
            remasterItem = RemastersItem()
            remasterItem['gameName'] = gameName
            print("=" * 50)
            yield remasterItem
class UserVoice(scrapy.Spider):
    """Scrape suggestions from the Xbox UserVoice backwards-compatibility forum.

    parse() reads the page count from each start listing and requests every
    page of it; userVoiceFind() yields one UserVoiceItem per suggestion.
    """
    name = "UserVoice"
    allowed_domains = ["xbox.uservoice.com"]
    start_urls = (
        'https://xbox.uservoice.com/forums/298503-backwards-compatibility?filter=top&page=1',
        'https://xbox.uservoice.com/forums/298503-backwards-compatibility/status/1222799?page=1',
        'https://xbox.uservoice.com/forums/298503-backwards-compatibility/status/1222800?page=1'
    )

    def parse(self, response):
        """Request pages 1 .. numberOfPages of the listing that was fetched."""
        # The page count is read from the second-to-last pager link.
        numberOfPages = int(response.xpath("/html/body/div[2]/div/div/div[1]/article/section[3]/div[2]/a/text()")[-2].extract())
        # Drop the final character of the URL (the "1" of "page=1") and append
        # the wanted page number instead.
        listing_url = str(response.request.url)[:-1]
        for j in range(1, numberOfPages + 1):
            next_page = listing_url + str(j)
            yield scrapy.Request(next_page, callback=self.userVoiceFind)

    def userVoiceFind(self, response):
        """Yield one item per suggestion in the listing.

        A field whose XPath matches nothing keeps the empty selector list it
        was looked up with; no row is skipped.
        """
        rows_in_big_table = response.xpath("/html/body/div[2]/div/div/div[1]/article/section[3]/ol/li")
        for onerow in rows_in_big_table:
            user_voice_item = UserVoiceItem()
            gameName = onerow.xpath('div[1]/h2/a/text()')
            if len(gameName) != 0:
                gameName = gameName[0].extract()
            votes = onerow.xpath('div[2]/div[1]/strong/text()')
            if len(votes) != 0:
                # Raw string: '\d' in a normal literal is an invalid escape
                # sequence. Joining the digit runs removes any separators.
                votes = ''.join(re.findall(r'\d+', votes[0].extract()))
            comments = onerow.xpath('div[3]/a/text()')
            if len(comments) != 0:
                comments = ''.join(re.findall(r'\d+', comments[0].extract()))
            in_progress = onerow.xpath('article/div[1]/a/em/text()')
            if len(in_progress) != 0:
                in_progress = in_progress[0].extract()
            user_voice_item['gameName'] = gameName
            user_voice_item['comments'] = comments
            user_voice_item['votes'] = votes
            user_voice_item['in_progress'] = in_progress
            yield user_voice_item
class WikipediaXB360Exclusive(BaseSpider):
    """Scrape Wikipedia's seventh-generation exclusives list.

    Each row of the fourth table under #mw-content-text that has an
    italicised, linked title yields one WikipediaXB360ExclusiveItem.
    """
    name = "WikipediaXB360Exclusive"
    allowed_domains = ['en.wikipedia.org']
    start_urls = (
        "https://en.wikipedia.org/wiki/List_of_video_game_exclusives_(seventh_generation)",
    )

    def parse(self, response):
        """Yield one item per table row that has a game title.

        A field other than the title whose XPath matches nothing keeps the
        empty selector list it was looked up with.
        """
        rows_in_big_table = response.xpath('//*[@id="mw-content-text"]/table[4]/tr')
        for onerow in rows_in_big_table:
            WXB360ExclusiveItem = WikipediaXB360ExclusiveItem()
            gameName = onerow.xpath('td/i/a/text()')
            if len(gameName) != 0:
                gameName = gameName[0].extract()
            # Rows without a title carry no game; skip them, as the
            # WikipediaXB360Kinect spider in this file does.
            if len(gameName) == 0:
                continue
            publisher = onerow.xpath('td[3]/a[1]/text()')
            if len(publisher) != 0:
                publisher = publisher[0].extract()
            releaseDate = onerow.xpath('td[5]/span[1]/text()')
            if len(releaseDate) != 0:
                # Keep characters 8..17 of the span text.
                # NOTE(review): presumably the ISO date inside a sort key - confirm.
                releaseDate = releaseDate[0].extract()[8:18]
            exclusiveType = onerow.xpath('td[4]/text()')
            if len(exclusiveType) != 0:
                exclusiveType = exclusiveType[0].extract()
            WXB360ExclusiveItem['gameName'] = gameName
            WXB360ExclusiveItem['publisher'] = publisher
            WXB360ExclusiveItem['releaseDate'] = releaseDate
            WXB360ExclusiveItem['exclusiveType'] = exclusiveType
            yield WXB360ExclusiveItem
class WikipediaXB360Kinect(BaseSpider):
    """Scrape Wikipedia's list of Kinect games for Xbox 360.

    Each table row with an italicised, linked title yields one
    WikipediaXB360KinectItem with 'kinectSupport' set to the string 'TRUE'.
    """
    name = "WikipediaXB360Kinect"
    allowed_domains = ['en.wikipedia.org']
    start_urls = (
        "https://en.wikipedia.org/wiki/List_of_Kinect_games_for_Xbox_360",
    )

    def parse(self, response):
        """Yield one item per table row that has a non-empty game title."""
        for row in response.xpath('//*[@id="mw-content-text"]/table/tr'):
            record = WikipediaXB360KinectItem()

            title = row.xpath('td/i/a/text()')
            if len(title) == 0:
                continue
            title = title[0].extract()
            if len(title) == 0:
                continue

            # Each remaining field keeps its (empty) selector list when the
            # lookup matches nothing.
            maker = row.xpath('td[3]/a/text()')
            if len(maker) != 0:
                maker = maker[0].extract()

            released = row.xpath('td/span[1]/text()')
            if len(released) != 0:
                released = released[0].extract()[8:18]

            needs_kinect = row.xpath('td[9]/text()')
            if len(needs_kinect) != 0:
                needs_kinect = needs_kinect[0].extract()

            record['gameName'] = title
            record['publisher'] = maker
            record['releaseDate'] = released
            record['kinectRequired'] = needs_kinect
            record['kinectSupport'] = 'TRUE'
            yield record
class Xbox360_MS_Site(scrapy.Spider):
    """Scrape the Xbox 360 marketplace listings and each game's detail page.

    parse() fans out over the paginated listing, xbPageFind() creates one
    item per listed game and follows its detail link, and
    scrapeIndividualGames() fills in the remaining fields and yields the item.
    """
    name = "Xbox360_MS_Site"
    allowed_domains = ['marketplace.xbox.com']
    start_urls = (
        'http://marketplace.xbox.com/en-US/Games/XboxArcadeGames?SortBy=BestSelling&PageSize=90&Page=1',
        'http://marketplace.xbox.com/en-US/Games/GamesOnDemand?pagesize=90&sortby=BestSelling&Page=1',
        'https://marketplace.xbox.com/en-US/Games/Xbox360Games?pagesize=90&sortby=BestSelling&page=1',
    )

    @staticmethod
    def _first_stripped(values):
        """Return values[0].strip() when `values` is non-empty, otherwise
        return `values` itself (the empty list) unchanged."""
        if len(values) != 0:
            return values[0].strip()
        return values

    def parse(self, response):
        """Request listing pages 1 .. numberOfPages for the fetched listing."""
        summary = response.xpath('//*[@id="BodyContent"]/div[3]/div[2]/div[1]/text()').extract()[0]
        summary = re.sub(",", "", summary)
        # The last number in the summary text is taken as the total item count.
        last_number = re.findall('[0-9.]+', summary)[-1]
        total_items = float(re.findall("[0-9]+", last_number)[-1])
        # The start URLs ask for 90 items per page.
        numberOfPages = int(math.ceil(total_items / 90))
        # Drop the final character of the URL (the "1" of "Page=1") and append
        # the wanted page number instead.
        listing_url = str(response.request.url)[:-1]
        for j in range(1, numberOfPages + 1):
            yield scrapy.Request(listing_url + str(j), callback=self.xbPageFind)

    def xbPageFind(self, response):
        """Create an item per listed game and follow the game's detail page."""
        baseURL = "http://marketplace.xbox.com"
        rows_in_big_table = response.xpath('//*[@id="BodyContent"]/div[3]/ol/li')
        for onerow in rows_in_big_table:
            xOne_item = Xbox360_MS_Site_Item()
            gameName = onerow.xpath('h2/a/text()').extract()[0].strip()
            # Remove every non-ASCII character from the title.
            gameName = re.sub(r'[^\x00-\x7F]+', '', gameName)
            gameUrl = baseURL + onerow.xpath('h2/a/@href').extract()[0] + '?PageSize=60&Page=1&SortBy=BestSelling'
            dayRecorded = time.strftime("%x")
            xOne_item['gameName'] = gameName
            xOne_item['gamesOnDemandorArcade'] = response.xpath('//*[@id="BodyContent"]/div[1]/h1/text()').extract()[0]
            xOne_item['gameUrl'] = gameUrl
            xOne_item['dayRecorded'] = dayRecorded
            yield scrapy.Request(url=gameUrl, callback=self.scrapeIndividualGames, meta={'xOne_item': xOne_item})

    def scrapeIndividualGames(self, response):
        """Fill the item passed in response.meta from a game's detail page.

        Nothing is yielded for E3 entries, trial games, picture packs and
        themes (decided from the item's gameName).
        """
        xOne_item = response.meta['xOne_item']
        DLlist = response.xpath('//*[@id="navDownloadType"]/li/a/text()').extract()
        gameCount = ""
        DLdemos = ""
        DLgameVideos = ""
        DLavatarItems = ""
        DLthemes = ""
        DLgamerPictures = ""
        DLgameAddons = ""
        DLsmartglass = ""
        gameNameLong = ""
        # Bound up front so the item assignment below cannot hit an unbound
        # name when the price lookup is skipped.
        priceGold = ""
        for phrase in DLlist:
            if 'Games ' in phrase:
                gameCount = int(re.findall('[0-9.]+', phrase)[0])
            if 'Game Demos' in phrase:
                DLdemos = re.findall('[0-9.]+', phrase)[0]
            elif 'Game Videos' in phrase:
                DLgameVideos = re.findall('[0-9.]+', phrase)[0]
            elif 'Game Add-ons' in phrase:
                DLgameAddons = re.findall('[0-9.]+', phrase)[0]
            elif 'Themes' in phrase:
                DLthemes = re.findall('[0-9.]+', phrase)[0]
            elif 'Gamer Pictures' in phrase:
                DLgamerPictures = re.findall('[0-9.]+', phrase)[0]
            elif 'Avatar Items' in phrase:
                DLavatarItems = re.findall('[0-9.]+', phrase)[0]
            elif 'Xbox SmartGlass' in phrase:
                DLsmartglass = re.findall('[0-9.]+', phrase)[0]
        # gameCount is still "" when no 'Games ' entry was found. Comparing
        # "" > 0 is True on Python 2 and a TypeError on Python 3, so the
        # "unknown count" case is spelled out to keep the Python 2 outcome.
        if gameCount == "" or gameCount > 0:
            priceGold = response.xpath('//*[@id="LiveZone"]/div[2]/ol/li/div/div[2]/span/span[1]/text()').extract()
            if len(priceGold) != 0:
                priceGold = priceGold[0].strip().lstrip("$")
            gameNameLong = response.xpath('//*[@id="LiveZone"]/div[2]/ol/li/div/div[1]/h2/text()').extract()
            if len(gameNameLong) != 0:
                gameNameLong = gameNameLong[0].strip()
                gameNameLong = re.sub("Full Game - ", "", gameNameLong)
            # Prefer the longer of the listing title and the detail-page title.
            if len(xOne_item['gameName']) < len(gameNameLong):
                xOne_item['gameName'] = gameNameLong
        if len(gameNameLong) == 0:
            # NOTE(review): this fallback list is not read again below.
            gameNameLong = [
                text.strip()
                for text in response.xpath('//*[@id="LiveZone"]/div[2]/ol/li/div/div/h2/text()').extract()
            ]
        lowered_name = xOne_item['gameName'].lower()
        if ('E3 2' in xOne_item['gameName'] or 'trial game' in lowered_name
                or ' pics' in lowered_name or ' theme' in lowered_name):
            return
        ProductPublishing = response.xpath('//*[@id="ProductPublishing"]')
        Overview1 = response.xpath('//*[@id="overview1"]')
        Overview2 = response.xpath('//*[@id="overview2"]')
        ProductTitleZone = response.xpath('//*[@id="ProductTitleZone"]')
        # Index of the next <li> to read from the publishing list. It only
        # moves past the first entry when that entry is a slash-separated date.
        ProductPublishingCount = 1
        releaseDate = ProductPublishing.xpath('li[' + str(ProductPublishingCount) + ']/text()').extract()
        if len(releaseDate) != 0:
            releaseDate = releaseDate[0].strip()
            if not releaseDate.replace("/", "").isdigit():
                releaseDate = ""
            else:
                ProductPublishingCount = ProductPublishingCount + 1
        developer = self._first_stripped(
            ProductPublishing.xpath('li[' + str(ProductPublishingCount) + ']/text()').extract())
        ProductPublishingCount = ProductPublishingCount + 1
        publisher = self._first_stripped(
            ProductPublishing.xpath('li[' + str(ProductPublishingCount) + ']/text()').extract())
        ProductPublishingCount = ProductPublishingCount + 1
        genre = self._first_stripped(
            ProductPublishing.xpath('li[' + str(ProductPublishingCount) + ']/text()').extract())
        features = Overview2.xpath('div[2]/div/div[1]/ul').extract()
        if len(features) != 0:
            features = features[0]
        # Looked up under #overview2 like `features`: the same relative path
        # evaluated on the whole response starts at the document root and
        # cannot match.
        # NOTE(review): assumes the online-features list is the sibling of the
        # features list - confirm against a live page.
        onlineFeatures = Overview2.xpath('div[2]/div/div[2]/ul').extract()
        if len(onlineFeatures) != 0:
            onlineFeatures = onlineFeatures[0]
        price = response.xpath('//*[@id="GetProduct"]/a/span/span/text()').extract()
        if len(price) != 0:
            price = price[0].strip().lstrip("$")
            if price == "Free":
                price = 0
        highresboxart = self._first_stripped(Overview1.xpath('div[1]/img/@src').extract())
        # Keep the last non-blank text node; if every node is blank the
        # extracted list itself is stored.
        ESRBRating = response.xpath('//*[@id="ActualRating"]/text()').extract()
        for i in ESRBRating:
            if len(i.strip()) != 0:
                ESRBRating = i.strip()
        xboxRatingStars = ProductTitleZone.xpath('div[2]/div/span/@class').extract()
        xboxRating = 0
        for start in xboxRatingStars:
            xboxRating += float(re.findall('[0-9.]+', start)[0]) / 4
        numberOfReviews = ProductTitleZone.xpath('div[2]/span/text()')
        if len(numberOfReviews) != 0:
            numberOfReviews = numberOfReviews.extract()[0].strip().strip(',')
        xOne_item['gameCount'] = gameCount
        xOne_item['developer'] = developer
        xOne_item['publisher'] = publisher
        xOne_item['genre'] = genre
        xOne_item['features'] = features
        xOne_item['onlineFeatures'] = onlineFeatures
        xOne_item['price'] = price
        xOne_item['priceGold'] = priceGold
        xOne_item['highresboxart'] = highresboxart
        xOne_item['ESRBRating'] = ESRBRating
        xOne_item['xbox360Rating'] = xboxRating
        xOne_item['releaseDate'] = releaseDate
        xOne_item['numberOfReviews'] = numberOfReviews
        xOne_item['DLsmartglass'] = DLsmartglass
        xOne_item['DLavatarItems'] = DLavatarItems
        xOne_item['DLdemos'] = DLdemos
        xOne_item['DLgameVideos'] = DLgameVideos
        xOne_item['DLgameAddons'] = DLgameAddons
        xOne_item['DLthemes'] = DLthemes
        xOne_item['DLgamerPictures'] = DLgamerPictures
        yield xOne_item
class XboxOne_MS_Site(scrapy.Spider):
    """Scrape the Microsoft Store "top paid" Xbox games listing.

    parse() derives the number of listing pages from the first page and
    requests each one via the skipitems offset; xbPageFind() yields one
    XboxOne_MS_Site_Item per product tile.
    """
    name = "XboxOne_MS_Site"
    allowed_domains = ['microsoft.com']
    start_urls = (
        'https://www.microsoft.com/en-us/store/top-paid/games/xbox?s=store&skipitems=0',
    )

    def parse(self, response):
        """Request the listing at offsets 0, 90, ... up to numberOfPages * 90."""
        # print() with a single argument behaves the same on Python 2 and 3;
        # the bare `print x` statement form is a syntax error on Python 3.
        print("=" * 50)
        numberOfPages = response.xpath('//*[@id="productPlacementList"]/div/p[1]/small/text()').extract()[0]
        # The last number in the summary text is divided by the 90-item step
        # used for skipitems below.
        numberOfPages = int(math.floor(float(re.findall("[0-9]+", numberOfPages)[-1]) / 90))
        print("=" * 50)
        for j in range(0, numberOfPages + 1):
            next_page = 'https://www.microsoft.com/en-us/store/top-paid/games/xbox?s=store&skipitems=' + str(j * 90)
            print(next_page)
            print("Page" + str(j))
            yield scrapy.Request(next_page, callback=self.xbPageFind)

    def xbPageFind(self, response):
        """Yield one item per product tile on a listing page.

        gameUrl, price and priceGold keep the empty lookup result when their
        XPath matches nothing.
        """
        baseURL = "https://www.microsoft.com"
        rows_in_big_table = response.xpath('//*[@id="productPlacementList"]/div/div/section/a')
        for onerow in rows_in_big_table:
            xOne_item = XboxOne_MS_Site_Item()
            gameName = onerow.xpath('div/h3/text()').extract()[0]
            gameUrl = onerow.xpath('@href')
            if len(gameUrl) != 0:
                gameUrl = gameUrl[0].extract().strip()
                gameUrl = baseURL + gameUrl
            price = onerow.xpath('div[2]/div[2]/span/text()')
            if len(price) == 0:
                # Fall back to the text inside the <s> element of the first span.
                price = onerow.xpath('div[2]/div[2]/span[1]/s/text()')
            if len(price) != 0:
                price = price[0].extract().strip()
            priceGold = onerow.xpath('div[2]/div[2]/span[1]/span[2]/text()').extract()
            if len(priceGold) > 0:
                priceGold = priceGold[0]
            dayRecorded = time.strftime("%x")
            xboxRating = onerow.xpath('div[2]/div[1]/p/span[1]/text()').extract()[0]
            xOne_item['gameName'] = gameName
            xOne_item['gameUrl'] = gameUrl
            xOne_item['price'] = price
            xOne_item['priceGold'] = priceGold
            xOne_item['dayRecorded'] = dayRecorded
            xOne_item['xboxRating'] = xboxRating
            yield xOne_item
            print("=" * 50)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment
You can’t perform that action at this time.