alexanderholt / wikipedia_scrape_lists.py
Last active January 29, 2021 01:24
Helpful for scraping list pages on Wikipedia, which you can then feed into the API to query specific pages
import wikipedia
import requests
from bs4 import BeautifulSoup
import time
import numpy as np

# first pull the HTML from the page that links to all of the list pages.
# in this case, that page links to the lists of sci-fi films by decade.
# just go to https://en.wikipedia.org/wiki/Lists_of_science_fiction_films
# to see what I'm pulling from.
response = requests.get('https://en.wikipedia.org/wiki/Lists_of_science_fiction_films')
soup = BeautifulSoup(response.text, 'html.parser')
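
The preview cuts off before the parsing step, so here's a minimal sketch of it; the anchor-text filter and the sleep interval are my assumptions, not from the original gist.

# grab links whose anchor text looks like a decade list,
# e.g. "List of science fiction films of the 1920s"
list_links = []
for a in soup.find_all('a', href=True):
    if a.get_text().startswith('List of science fiction films'):
        list_links.append('https://en.wikipedia.org' + a['href'])

# fetch each decade page in turn, pausing between requests
for url in list_links:
    page_html = requests.get(url).text
    time.sleep(1)  # be polite to Wikipedia's servers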
alexanderholt / wikipedia_section_loop.py
Last active January 29, 2021 01:26
For when pages have different names for the same type of section.
import wikipedia
import numpy as np
# you'll need the exact titles of the pages beforehand
example_titles = ['Algol (film)', 'Dr. Jekyll and Mr. Hyde (1920 Haydon film)',
                  'Figures of the Night', 'The Invisible Ray (1920 serial)',
                  'The Man from Beyond', 'Black Oxen', 'Aelita',
                  'The Hands of Orlac (1924 film)']
# create a list of all the names you think/know the section might be called
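
The preview stops at that comment, so here's a minimal sketch of the loop it sets up; the candidate section names and the break-on-first-hit behavior are my assumptions.

possible_names = ['Plot', 'Plot summary', 'Synopsis']

plots = {}
for title in example_titles:
    page = wikipedia.WikipediaPage(title)
    for name in possible_names:
        text = page.section(name)
        if text:  # .section() returns None when a page lacks that heading
            plots[title] = text
            break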
alexanderholt / wikipedia_section.py
Last active January 29, 2021 01:26
Pull specific section of Wikipedia page using wikipedia python package
import wikipedia

# get a section of a page, in this case the Plot of Metropolis
section = wikipedia.WikipediaPage('Metropolis (1927 film)').section('Plot')
# that returns fairly clean text, but the next line cleans it up
# further by stripping newlines and escaped quotes
section = section.replace('\n', '').replace("\'", "")
alexanderholt / wikipedia_summary.py
Last active January 29, 2021 01:26
Get the Summary of a given Wikipedia Page
import wikipedia
print(wikipedia.WikipediaPage(title = 'Metropolis (1927 film)').summary)

import numpy as np

# log-spaced grid of 15 values running from 1e-3 to 1e3
[10 ** i for i in np.linspace(-3, 3, 15)]
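
A hedged usage sketch: grids like this usually feed a hyperparameter search, so the estimator and the C parameter below are my illustration, not something from the gist.

import numpy as np
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV

# hypothetical: search regularization strength C over the log-spaced grid
param_grid = {'C': [10 ** i for i in np.linspace(-3, 3, 15)]}
search = GridSearchCV(LogisticRegression(max_iter=1000), param_grid, cv=5)
# search.fit(X_train, y_train)  # X_train / y_train assumed to exist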

# start in your home directory, where .bash_profile lives
cd ~
nano .bash_profile

import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer

# fit the vectorizer on the training text, keeping the 100 strongest features
tvec = TfidfVectorizer(max_features=100, stop_words='english')
df_vec = pd.DataFrame(tvec.fit_transform(X_train).todense(),
                      columns=tvec.get_feature_names())
# only transform the test set; never fit on it
df_vec_test = pd.DataFrame(tvec.transform(X_test).todense(),
                           columns=tvec.get_feature_names())

from sklearn.cluster import DBSCAN
from sklearn.preprocessing import StandardScaler
from sklearn import metrics

# DBSCAN is distance-based, so scale the features first
X_scaled = StandardScaler().fit_transform(X)
dbscn = DBSCAN(eps=4, min_samples=5).fit(X_scaled)
labels = dbscn.labels_  # -1 marks points DBSCAN considers noise
print(labels)
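
The metrics import goes unused in the preview, presumably for scoring the clustering afterwards. Here's one common check, a silhouette score on the non-noise points; excluding the noise label is my choice, not from the gist.

import numpy as np

# silhouette needs at least two clusters, and -1 is noise, not a cluster
mask = labels != -1
if len(np.unique(labels[mask])) > 1:
    score = metrics.silhouette_score(X_scaled[mask], labels[mask])
    print(f'silhouette (noise excluded): {score:.3f}')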

from sklearn.pipeline import make_pipeline
from sklearn.decomposition import PCA
from sklearn.linear_model import LinearRegression

pca = PCA()
lr = LinearRegression()
# make a pipeline that chains together the pca and the linear regression.
# this means that when the X data gets "piped in" it first hits the PCA,
# which fits itself to the data and transforms the original variables
# into their principal-component "new variables" before the regression sees them.
pipe = make_pipeline(pca, lr)
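
A quick fit/predict sketch with synthetic stand-in data, purely to show the flow; the demo arrays are mine, not from the gist.

import numpy as np

rng = np.random.default_rng(0)
X_demo = rng.normal(size=(100, 5))
y_demo = X_demo @ rng.normal(size=5) + rng.normal(scale=0.1, size=100)

pipe.fit(X_demo, y_demo)         # PCA fits and transforms, then the regression fits
print(pipe.predict(X_demo[:3]))  # the same PCA transform is reapplied at predict time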

# pull out just the subjective columns (sd and subjective_cols are
# assumed to be defined earlier, outside this preview)
subjective = sd[subjective_cols]
## Remember, we need to center before PCA and should standardize.
subjective = (subjective - subjective.mean()) / subjective.std()
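
A minimal follow-up, fitting PCA on the standardized frame and checking how much variance each component explains; this step is my addition, not in the preview.

from sklearn.decomposition import PCA

pca_subj = PCA()
components = pca_subj.fit_transform(subjective)
# each entry is that component's share of the total variance
print(pca_subj.explained_variance_ratio_)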