alexanderholt / wikipedia_scrape_lists.py
Last active January 29, 2021 01:24
Helpful for scraping list pages on Wikipedia, which you can then feed into the API to query specific pages
import wikipedia
import requests
from bs4 import BeautifulSoup
import time
import numpy as np

# first pull the HTML from the page that links to all of the list pages.
# in this case, that page links to the lists of sci-fi films by decade.
# just go to https://en.wikipedia.org/wiki/Lists_of_science_fiction_films
# to see what I'm pulling from.
response = requests.get('https://en.wikipedia.org/wiki/Lists_of_science_fiction_films')
soup = BeautifulSoup(response.text, 'html.parser')
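
The preview cuts off before the parsing step, so here's a minimal sketch of it; the anchor-text filter and the sleep interval are my assumptions, not from the original gist.

# grab links whose anchor text looks like a decade list,
# e.g. "List of science fiction films of the 1920s"
list_links = []
for a in soup.find_all('a', href=True):
    if a.get_text().startswith('List of science fiction films'):
        list_links.append('https://en.wikipedia.org' + a['href'])

# fetch each decade page in turn, pausing between requests
for url in list_links:
    page_html = requests.get(url).text
    time.sleep(1)  # be polite to Wikipedia's servers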
alexanderholt / wikipedia_section_loop.py
Last active January 29, 2021 01:26
For when pages have different names for the same type of section.
import wikipedia
import numpy as np
# you'll need the exact titles of the pages beforehand
example_titles = ['Algol (film)', 'Dr. Jekyll and Mr. Hyde (1920 Haydon film)',
                  'Figures of the Night', 'The Invisible Ray (1920 serial)',
                  'The Man from Beyond', 'Black Oxen', 'Aelita',
                  'The Hands of Orlac (1924 film)']
# create a list of all the names you think/know the section might be called
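
The preview stops at that comment, so here's a minimal sketch of the loop it sets up; the candidate section names and the break-on-first-hit behavior are my assumptions.

possible_names = ['Plot', 'Plot summary', 'Synopsis']

plots = {}
for title in example_titles:
    page = wikipedia.WikipediaPage(title)
    for name in possible_names:
        text = page.section(name)
        if text:  # .section() returns None when a page lacks that heading
            plots[title] = text
            break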
alexanderholt / wikipedia_section.py
Last active January 29, 2021 01:26
Pull specific section of Wikipedia page using wikipedia python package
import wikipedia

# get a section of a page, in this case the Plot of Metropolis
section = wikipedia.WikipediaPage('Metropolis (1927 film)').section('Plot')
# that returns fairly clean text, but the next line cleans it up
# further by stripping newlines and escaped quotes
section = section.replace('\n', '').replace("\'", "")
alexanderholt / wikipedia_summary.py
Last active January 29, 2021 01:26
Get the Summary of a given Wikipedia Page
import wikipedia
print(wikipedia.WikipediaPage(title = 'Metropolis (1927 film)').summary)

import numpy as np

# log-spaced grid of 15 values running from 1e-3 to 1e3
[10 ** i for i in np.linspace(-3, 3, 15)]
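
A hedged usage sketch: grids like this usually feed a hyperparameter search, so the estimator and the C parameter below are my illustration, not something from the gist.

import numpy as np
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV

# hypothetical: search regularization strength C over the log-spaced grid
param_grid = {'C': [10 ** i for i in np.linspace(-3, 3, 15)]}
search = GridSearchCV(LogisticRegression(max_iter=1000), param_grid, cv=5)
# search.fit(X_train, y_train)  # X_train / y_train assumed to exist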

# start in your home directory, where .bash_profile lives
cd ~
nano .bash_profile

import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer

# fit the vectorizer on the training text, keeping the 100 strongest features
tvec = TfidfVectorizer(max_features=100, stop_words='english')
df_vec = pd.DataFrame(tvec.fit_transform(X_train).todense(),
                      columns=tvec.get_feature_names())
# only transform the test set; never fit on it
df_vec_test = pd.DataFrame(tvec.transform(X_test).todense(),
                           columns=tvec.get_feature_names())

from sklearn.cluster import DBSCAN
from sklearn.preprocessing import StandardScaler
from sklearn import metrics

# DBSCAN is distance-based, so scale the features first
X_scaled = StandardScaler().fit_transform(X)
dbscn = DBSCAN(eps=4, min_samples=5).fit(X_scaled)
labels = dbscn.labels_  # -1 marks points DBSCAN considers noise
print(labels)
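
The metrics import goes unused in the preview, presumably for scoring the clustering afterwards. Here's one common check, a silhouette score on the non-noise points; excluding the noise label is my choice, not from the gist.

import numpy as np

# silhouette needs at least two clusters, and -1 is noise, not a cluster
mask = labels != -1
if len(np.unique(labels[mask])) > 1:
    score = metrics.silhouette_score(X_scaled[mask], labels[mask])
    print(f'silhouette (noise excluded): {score:.3f}')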

from sklearn.pipeline import make_pipeline
from sklearn.decomposition import PCA
from sklearn.linear_model import LinearRegression

pca = PCA()
lr = LinearRegression()
# make a pipeline that chains together the pca and the linear regression.
# this means that when the X data gets "piped in" it first hits the PCA,
# which fits itself to the data and transforms the original variables
# into their principal-component "new variables" before the regression sees them.
pipe = make_pipeline(pca, lr)
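
A quick fit/predict sketch with synthetic stand-in data, purely to show the flow; the demo arrays are mine, not from the gist.

import numpy as np

rng = np.random.default_rng(0)
X_demo = rng.normal(size=(100, 5))
y_demo = X_demo @ rng.normal(size=5) + rng.normal(scale=0.1, size=100)

pipe.fit(X_demo, y_demo)         # PCA fits and transforms, then the regression fits
print(pipe.predict(X_demo[:3]))  # the same PCA transform is reapplied at predict time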

# pull out just the subjective columns (sd and subjective_cols are
# assumed to be defined earlier, outside this preview)
subjective = sd[subjective_cols]
## Remember, we need to center before PCA and should standardize.
subjective = (subjective - subjective.mean()) / subjective.std()
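
A minimal follow-up, fitting PCA on the standardized frame and checking how much variance each component explains; this step is my addition, not in the preview.

from sklearn.decomposition import PCA

pca_subj = PCA()
components = pca_subj.fit_transform(subjective)
# each entry is that component's share of the total variance
print(pca_subj.explained_variance_ratio_)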