Keenan Burke-Pitts Kiwibp

## Facebook-Marketplace-Selenium.py
#facebook marketplace
from selenium import webdriver
from time import sleep
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from pymongo import MongoClient

class App:

## craigslist-scrapy-spider.py
# -*- coding: utf-8 -*-
import scrapy
import sys

class CraigslistSpider(scrapy.Spider):
    name = 'craigslist'
    allowed_domains = ['asheville.craigslist.org']
    start_urls = ['https://asheville.craigslist.org/search/sss']

    def parse(self, response):

## letgo-scrapy-spider.py
# -*- coding: utf-8 -*-
import scrapy
import json
import requests
import re
from time import sleep
import sys

class LetgoSpider(scrapy.Spider):
    name = 'letgo'

## gboost_model_evaluation.py
# Fit the gradient-boosting estimator on the training split.
gboost.fit(X_train, y_train)

# Predict one class label per row of the held-out test split.
y_pred_class = gboost.predict(X_test)

# Argument order matters: true labels first, predicted labels second.
# NOTE(review): assumes `metrics` is sklearn.metrics - confirm against the imports.
print(metrics.confusion_matrix(y_true=y_test, y_pred=y_pred_class))

binary = np.array([[125, 14],

## cross_validation.py
# Candidate classifiers, each built with its default settings
# (the SVC additionally enables probability estimates).
logreg, logreg_cv = LogisticRegression(), LogisticRegressionCV()
rf, gboost = RandomForestClassifier(), GradientBoostingClassifier()
svm = SVC(probability=True)
knn, dt = KNeighborsClassifier(), DecisionTreeClassifier()

# All candidates gathered in one list; presumably iterated over by the
# cross-validation code that follows (not visible here).
models = [logreg, logreg_cv, rf, gboost, svm, knn, dt]

## feature_selection.py
# Fit a random forest so that its per-feature importances can be inspected;
# features with low importance are candidates for removal.
clf = RandomForestClassifier(n_estimators=50, max_features='sqrt').fit(train, targets)

# Pair every training column with the importance the fitted forest gave it.
features = pd.DataFrame({
    'feature': train.columns,
    'importance': clf.feature_importances_,
})

# Sorting values by feature importance.

## wordcloud.py
# Build the cloud from the item names themselves. str() on a pandas Series
# returns its printed repr (index labels, "..." truncation for long Series and
# the "Name: ..., dtype: ..." footer), all of which would be counted as words.
# NOTE(review): assumes postings['name'] is a pandas Series - confirm.
item_names = ' '.join(postings['name'].dropna().astype(str))
wordcloud = WordCloud(background_color='white', mode="RGB", width=2000, height=1000).generate(item_names)
plt.title("Craigslist Used Items Word Cloud")
plt.imshow(wordcloud)
plt.axis("off")  # hide the axes; they carry no meaning for a word cloud
plt.show()

## popular-locations-subset.py
# Keep only the locations that occur more than twice in non_mv.
counts = non_mv['location'].value_counts()
loc_gt2 = counts[counts > 2]

# Rows whose location survived the frequency filter above.
popular_locations = non_mv[non_mv['location'].isin(loc_gt2.index)]

# Price distribution per remaining location.
plt.figure(figsize=(10, 5))
sns.violinplot(data=popular_locations, x="location", y="price", scale="width", inner="stick")
plt.show()

## scrapy-items-and-spider-scripts.py

import scrapy


class CraigslistWebscrapingItem(scrapy.Item):
    """Scrapy item declaring four fields: name, price, location and date.

    Presumably one instance is filled per Craigslist posting; confirm
    against the spider that populates it.
    """
    # Every field is a plain scrapy.Field() with no extra metadata.
    name = scrapy.Field()
    price = scrapy.Field()
    location = scrapy.Field()
    date = scrapy.Field()


## motor-vehicles-subset.py
# Rows priced at 1.8k or above appear to be motor vehicles and skew the
# boxplot too much, so they are pulled out into their own frame.
motor_vehicles = postings[postings['price'] >= 1800.0]

# One bar per extracted row: item name on the x axis, price on the y axis.
motor_vehicles.plot.bar(x='name', y='price', figsize=(9, 9))
plt.xlabel("Vehicle")
plt.ylabel("Price")
plt.show()
	#facebook marketplace
	from selenium import webdriver
	from time import sleep
	from selenium.webdriver.common.keys import Keys
	from selenium.webdriver.support.ui import WebDriverWait
	from selenium.webdriver.common.by import By
	from selenium.webdriver.support import expected_conditions as EC
	from pymongo import MongoClient

	class App:
	# -*- coding: utf-8 -*-
	import scrapy
	import sys

	class CraigslistSpider(scrapy.Spider):
	name = 'craigslist'
	allowed_domains = ['asheville.craigslist.org']
	start_urls = ['https://asheville.craigslist.org/search/sss']

	def parse(self, response):
	# -*- coding: utf-8 -*-
	import scrapy
	import json
	import requests
	import re
	from time import sleep
	import sys

	class LetgoSpider(scrapy.Spider):
	name = 'letgo'
	# Fit the gradient-boosting estimator on the training split.
	gboost.fit(X_train, y_train)

	# Predict one class label per row of the held-out test split.
	y_pred_class = gboost.predict(X_test)

	# Argument order matters: true labels first, predicted labels second.
	# NOTE(review): assumes `metrics` is sklearn.metrics - confirm against the imports.
	print(metrics.confusion_matrix(y_true=y_test, y_pred=y_pred_class))

	binary = np.array([[125, 14],
	# Candidate classifiers, each built with its default settings
	# (the SVC additionally enables probability estimates).
	logreg, logreg_cv = LogisticRegression(), LogisticRegressionCV()
	rf, gboost = RandomForestClassifier(), GradientBoostingClassifier()
	svm = SVC(probability=True)
	knn, dt = KNeighborsClassifier(), DecisionTreeClassifier()

	# All candidates gathered in one list; presumably iterated over by the
	# cross-validation code that follows (not visible here).
	models = [logreg, logreg_cv, rf, gboost, svm, knn, dt]
	# Fit a random forest so that its per-feature importances can be inspected;
	# features with low importance are candidates for removal.
	clf = RandomForestClassifier(n_estimators=50, max_features='sqrt').fit(train, targets)

	# Pair every training column with the importance the fitted forest gave it.
	features = pd.DataFrame({
		'feature': train.columns,
		'importance': clf.feature_importances_,
	})

	# Sorting values by feature importance.
	# Build the cloud from the item names themselves. str() on a pandas Series
	# returns its printed repr (index labels, "..." truncation for long Series and
	# the "Name: ..., dtype: ..." footer), all of which would be counted as words.
	# NOTE(review): assumes postings['name'] is a pandas Series - confirm.
	item_names = ' '.join(postings['name'].dropna().astype(str))
	wordcloud = WordCloud(background_color='white', mode="RGB", width=2000, height=1000).generate(item_names)
	plt.title("Craigslist Used Items Word Cloud")
	plt.imshow(wordcloud)
	plt.axis("off")  # hide the axes; they carry no meaning for a word cloud
	plt.show()
	# Keep only the locations that occur more than twice in non_mv.
	counts = non_mv['location'].value_counts()
	loc_gt2 = counts[counts > 2]

	# Rows whose location survived the frequency filter above.
	popular_locations = non_mv[non_mv['location'].isin(loc_gt2.index)]

	# Price distribution per remaining location.
	plt.figure(figsize=(10, 5))
	sns.violinplot(data=popular_locations, x="location", y="price", scale="width", inner="stick")
	plt.show()

	import scrapy


	class CraigslistWebscrapingItem(scrapy.Item):
		"""Scrapy item declaring four fields: name, price, location and date.

		Presumably one instance is filled per Craigslist posting; confirm
		against the spider that populates it.
		"""
		# The field declarations must be indented under the class header;
		# at the header's own level the class has no body and the fields
		# are not attributes of the item.
		name = scrapy.Field()
		price = scrapy.Field()
		location = scrapy.Field()
		date = scrapy.Field()
	# Rows priced at 1.8k or above appear to be motor vehicles and skew the
	# boxplot too much, so they are pulled out into their own frame.
	motor_vehicles = postings[postings['price'] >= 1800.0]

	# One bar per extracted row: item name on the x axis, price on the y axis.
	motor_vehicles.plot.bar(x='name', y='price', figsize=(9, 9))
	plt.xlabel("Vehicle")
	plt.ylabel("Price")
	plt.show()