This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
      Learn more about bidirectional Unicode characters
    
  
  
    
  | import wikipedia | |
| import requests | |
| from bs4 import BeautifulSoup | |
| import time | |
| import numpy as np | |
| # first pull the HTML from the page that links to all of the pages with the links. | |
| # in this case, this page links to the list pages of sci-fi films by decade. | |
| # just go to https://en.wikipedia.org/wiki/Lists_of_science_fiction_films | |
| # to see what I'm pulling from. | 
  
    
      This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
      Learn more about bidirectional Unicode characters
    
  
  
    
  | import wikipedia | |
| import numpy as np | |
| # you'll need to get the exact names of the titles of the pages beforehand | |
# Wikipedia page titles to look up. The list literal opens on the same line
# as the assignment: a bare `example_titles =` followed by a newline is a
# SyntaxError.
example_titles = [
    'Algol (film)',
    'Dr. Jekyll and Mr. Hyde (1920 Haydon film)',
    'Figures of the Night',
    'The Invisible Ray (1920 serial)',
    'The Man from Beyond',
    'Black Oxen',
    'Aelita',
    'The Hands of Orlac (1924 film)',
]
| # create a list of all the names you think/know the section might be called | 
  
    
      This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
      Learn more about bidirectional Unicode characters
    
  
  
    
  | # get the section of a page. In this case the Plot description of Metropolis | |
metropolis = wikipedia.WikipediaPage('Metropolis (1927 film)')
section = metropolis.section('Plot')
# The returned text is already fairly clean; strip the remaining newlines
# and apostrophes in a single pass.
section = section.translate(str.maketrans('', '', "\n'"))
  
    
      This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
      Learn more about bidirectional Unicode characters
    
  
  
    
  | import wikipedia | |
# Look up the article by its title, then print its summary text.
metropolis_page = wikipedia.WikipediaPage(title='Metropolis (1927 film)')
print(metropolis_page.summary)
  
    
      This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
      Learn more about bidirectional Unicode characters
    
  
  
    
  | [10 ** i for i in np.linspace(-3,3,15)] | 
  
    
      This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
      Learn more about bidirectional Unicode characters
    
  
  
    
  | # start in your home directory (~), where .bash_profile lives | |
# Open the profile by explicit path so the command works from any
# working directory, not only when run from the home directory.
nano ~/.bash_profile
  
    
      This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
      Learn more about bidirectional Unicode characters
    
  
  
    
  | from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer | |
# Limit the vocabulary to 100 features and drop English stop words.
tvec = TfidfVectorizer(max_features=100, stop_words='english')

# fit_transform() learns the vocabulary and IDF weights from the training text
# and vectorizes it in one step, so a separate fit() call beforehand is
# redundant work.
train_matrix = tvec.fit_transform(X_train)

# get_feature_names() was removed in scikit-learn 1.2; get_feature_names_out()
# is its replacement (available since 1.0).
feature_names = tvec.get_feature_names_out()

# toarray() returns a plain ndarray, whereas todense() returns np.matrix.
df_vec = pd.DataFrame(train_matrix.toarray(), columns=feature_names)

# The test text is only transformed, reusing the vocabulary fitted on the
# training text, so both frames share the same columns.
df_vec_test = pd.DataFrame(tvec.transform(X_test).toarray(), columns=feature_names)
  
    
      This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
      Learn more about bidirectional Unicode characters
    
  
  
    
  | from sklearn.cluster import DBSCAN | |
| from sklearn.preprocessing import StandardScaler | |
| from sklearn import metrics | |
# Standardize the features, then cluster the scaled data with DBSCAN.
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

dbscn = DBSCAN(eps=4, min_samples=5)
dbscn.fit(X_scaled)

# One cluster label per sample (scikit-learn marks noise points with -1).
labels = dbscn.labels_
print(labels)
  
    
      This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
      Learn more about bidirectional Unicode characters
    
  
  
    
  | from sklearn.pipeline import make_pipeline | |
| from sklearn.decomposition import PCA | |
| pca = PCA() | |
| lr = LinearRegression() | |
| # make a pipeline that chains together the pca and the linear regression | |
| # this means that when the X data gets "piped in" it first hits the PCA, | |
| # which will fit it to the data, then transform the original variables | |
| # into their principal component "new variables". | 
  
    
      This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
      Learn more about bidirectional Unicode characters
    
  
  
    
  | subjective = sd[subjective_cols] | |
# PCA needs centered inputs, and standardizing is recommended as well:
# subtract each column's mean and divide by its standard deviation.
column_means = subjective.mean()
column_spreads = subjective.std()
subjective = (subjective - column_means) / column_spreads
NewerOlder