Skip to content

Instantly share code, notes, and snippets.

View Kiwibp's full-sized avatar

Keenan Burke-Pitts Kiwibp

View GitHub Profile
@Kiwibp
Kiwibp / BSsnippet.py
Last active June 7, 2018 21:16
NYCDSA DataViz Project -- BeautifulSoup Code Snippet
# Fetch the RotoGrinders advanced guard-stats page and parse it two ways:
# once with BeautifulSoup (manual extraction) and once with pandas.read_html.
guards_advanced = urllib.request.urlopen(
    "https://rotogrinders.com/pages/nba-advanced-player-stats-guards-181885"
).read()
# NOTE(review): the pasted name "guards_advancedguards_" was a copy/paste
# duplication; renamed to describe what it holds (the parsed soup).
guards_advanced_soup = bs.BeautifulSoup(guards_advanced, 'lxml')
# leaving out a number of lines necessary to extract data, see github repo for full code if you'd like.
# `col_names` is produced by the omitted extraction code above — TODO confirm.
guards_advanced_col_names = col_names.split()
print(guards_advanced_col_names)
# could also use pandas read_html method as well
guards_advanced_dfs = pd.read_html("https://rotogrinders.com/pages/nba-advanced-player-stats-guards-181885")
# Table index 2 is the advanced-stats table on this page.
guards_advanced_stats_df = guards_advanced_dfs[2]
guards_advanced_stats_df.tail()
# Pipeline adjustment to export scraped items to MongoDB.
from pymongo import MongoClient
# `scrapy.conf` was deprecated and later removed from Scrapy;
# get_project_settings() is the supported way to read project settings here.
from scrapy.utils.project import get_project_settings

settings = get_project_settings()


class MongoDBPipeline(object):
    """Scrapy item pipeline that writes scraped items into MongoDB."""

    def __init__(self):
        # Connect using the server/port configured in the project settings.
        # NOTE(review): snippet is truncated here — `connection` is unused in
        # the visible portion; the full repo presumably selects a db/collection.
        connection = MongoClient(
            settings['MONGODB_SERVER'],
            settings['MONGODB_PORT'])
# -*- coding: utf-8 -*-
import scrapy
import json
import requests
import re
from time import sleep
import sys


class LetgoSpider(scrapy.Spider):
    """Scrapy spider for letgo listings (snippet truncated after the name)."""
    name = 'letgo'
# -*- coding: utf-8 -*-
import scrapy
import sys


class CraigslistSpider(scrapy.Spider):
    """Scrapy spider for Asheville Craigslist 'for sale' search results."""
    name = 'craigslist'
    # Restrict crawling to the Asheville Craigslist subdomain.
    allowed_domains = ['asheville.craigslist.org']
    # /search/sss is the combined "for sale" search listing.
    start_urls = ['https://asheville.craigslist.org/search/sss']

    def parse(self, response):
        """Parse one search-results page (body truncated in this snippet)."""
# facebook marketplace
from selenium import webdriver
from time import sleep
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from pymongo import MongoClient


class App:
    """Selenium-driven Facebook Marketplace scraper (body truncated in this snippet)."""
# Keep only items whose Location occurs at least 10 times in the data set.
# NOTE(review): the original lambda compared a value_counts() Series against 10;
# GroupBy.filter requires a scalar boolean, and that form only worked via the
# (since-removed) truthiness of a one-element Series. Within a group the
# Location is constant, so len(g) is the same count and is a proper scalar.
locations_ten_or_more = (
    all_items_df.groupby(['Location'])
    .filter(lambda g: len(g) >= 10)
    .loc[:, ['Location', 'Description', 'Price', 'Title', 'Url']]
)
# checking the number of locations that remain (those with 10 items or more)
len_of_locs = len(locations_ten_or_more.groupby("Location").size())
print(f'There are {len_of_locs} cities with 10 items or more.')
print('\n')
# checking the locations with the most items in this subset
print('Locations with the most amount of items in this subset:')
# Execute the MonkeyLearn Summary Extractor model on the sample text column.
ml = MonkeyLearn('insert api key here')  # API key placeholder — supply a real key
# Column 7 of nlp_df_sample holds the text to analyze — TODO confirm against the notebook.
data = list(nlp_df_sample.iloc[:,7])
model_id = 'ex_94WD2XxD'  # Summary Extractor model id
summary_model_results = ml.extractors.extract(model_id, data, production_model=True)
print(summary_model_results.body)
# Execute the Price Extractor model on the same text column.
data = list(nlp_df_sample.iloc[:,7])
model_id = 'ex_wNDME4vE'  # Price Extractor model id
@Kiwibp
Kiwibp / scrapy-items-and-spider-scripts.py
Last active June 11, 2018 16:16
Craigslist Spider for Webscraping Project
import scrapy


class CraigslistWebscrapingItem(scrapy.Item):
    """Container for one scraped Craigslist posting."""
    # Fields populated by the spider for each listing.
    name = scrapy.Field()
    price = scrapy.Field()
    location = scrapy.Field()
    date = scrapy.Field()
@Kiwibp
Kiwibp / motor-vehicles-subset.py
Created June 11, 2018 16:15
Craigslist Webscraping Project
# vehicles are skewing boxplot too much; all rows at or above 1.8k appear to be motor vehicles.
is_motor_vehicle = postings.price >= 1800.0
motor_vehicles = postings.loc[is_motor_vehicle, :]
# Bar chart of price per vehicle listing.
motor_vehicles.plot.bar(x='name', y='price', figsize=(9, 9))
plt.ylabel("Price")
plt.xlabel("Vehicle")
plt.show()
@Kiwibp
Kiwibp / popular-locations-subset.py
Created June 11, 2018 16:18
Craigslist Webscraping Project