monspo1 / proj3_webscrape1.py
Open the connection to indeed.com using BeautifulSoup
# load the libraries
from bs4 import BeautifulSoup as Soup
import requests, re
import pandas as pd

# indeed.com url
base_url = 'http://www.indeed.com/jobs?q=data+scientist&jt=fulltime&sort='
sort_by = 'date'        # sort by date
start_from = '&start='  # query parameter for the result offset
pd.set_option('display.max_colwidth', 500)  # lift the column-width limit (otherwise we'd lose some info)

for page in range(1, 101):  # pages 1 to 100 (100 is the last page we can scrape)
    start = (page - 1) * 10  # indeed paginates in steps of 10 results
    url = "%s%s%s%d" % (base_url, sort_by, start_from, start)  # build the full url
    target = Soup(requests.get(url).text, "lxml")  # urllib.urlopen is Python-2-only; use requests instead
    targetElements = target.findAll('div', attrs={'class': ' row result'})  # each ' row result' div is one job posting
    # pull the job-specific information (company name, job title, urls, ...) out of each row
    for elem in targetElements:
        comp_name = elem.find('span', attrs={'itemprop': 'name'}).getText().strip()
# ... (the cell that collects the scraped fields into the DataFrame `df` is omitted in the gist) ...

# Second pass: visit each company's indeed page for extra details (e.g. the logo).
df_received = df
for i in range(0, len(df_received)):
    target_comp_name = df_received.iloc[i]['comp_name']
    url_2nd = df_received.iloc[i]['overall_link']
    if url_2nd is not None:
        target_2nd = Soup(requests.get(url_2nd).text, "lxml")
        comp_logo = target_2nd.find("div", {"id": "cmp-header-logo"}).find('img')
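
The gist never shows the cell that builds `df`. A minimal sketch of how it could be assembled inside the page loop above, reusing the column names looked up later (`comp_name`, `overall_link`); the generic `elem.find('a')` link lookup is an assumption, not the author's selector:

records = []  # one dict per job posting, accumulated across pages
for elem in targetElements:
    comp_name = elem.find('span', attrs={'itemprop': 'name'}).getText().strip()
    link_tag = elem.find('a')  # assumed selector; the original likely targets a specific anchor class
    overall_link = 'http://www.indeed.com' + link_tag['href'] if link_tag else None
    records.append({'comp_name': comp_name, 'overall_link': overall_link})
df = pd.DataFrame(records)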
monspo1 / proj3_webscrape4_save_result.txt
Save the result to CSV
# Save the result to CSV
df_received.to_csv('./data/indeed_companies.csv', encoding='utf-8')

# Indeed_comp_complete_for_review.csv is derived from 'indeed_companies.csv' and only
# contains companies that have a company page on indeed.com (the rest were dropped).
df_received_for_review = pd.read_csv('./data/Indeed_comp_complete_for_review.csv')

df_for_review = pd.DataFrame()  # new data frame for the de-duplicated companies
comp_set = set()                # company names already seen (duplicate check)
for i in range(0, len(df_received_for_review)):
    target_comp_name = df_received_for_review.iloc[i]['comp_name']
    if target_comp_name in comp_set:  # skip duplicates
        continue
    comp_set.add(target_comp_name)
    # ... (per-company processing is omitted in the gist) ...
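
For reference, pandas can do the same de-duplication in one call; an equivalent sketch, assuming the company name is the only key:

df_for_review = df_received_for_review.drop_duplicates(subset='comp_name').reset_index(drop=True)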
header <- dashboardHeader(
  title = "DS Salary Explorer"
)

sidebar <- dashboardSidebar(
  sidebarMenu(
    # icon() takes the bare Font Awesome name, without the "fa fa-" prefix
    menuItem("Salary Scatter Plot", tabName = "myTabForScatterPlot", icon = icon("bar-chart-o")),
    menuItem("Salary Data Explorer", tabName = "myTabForDataTable", icon = icon("table")),
    menuItem("Salary Comparison Map", tabName = "myTabForGvisMap", icon = icon("map-marker")),
    menuItem("Top Recruiters", tabName = "myTabForRecruitRanking", icon = icon("list-ol"))
    # ... (remaining menu items omitted) ...
  )
)
server <- function(input, output) {
  # ...
  # ... Other functions are intentionally omitted for brevity ...
  # ...

  #////////////////////////////////////////////////////////////////////////////////
  # reactive function for comparison Map 1 and comparison table 1
  #////////////////////////////////////////////////////////////////////////////////
  updateInputDataForMapByJobTitle1 <- reactive({
    # ... (body omitted in the gist) ...
  })
}
shinyApp(
  ui = dashboardPage(header, sidebar, body, skin = "red"),  # "red" skin
  server = server  # pass the server as a named argument, not via `<-`
)
# (1) Fit a decision tree model on the training set with the default settings.
from sklearn import tree
from sklearn.model_selection import GridSearchCV  # sklearn.grid_search was removed in later releases

tree_model = tree.DecisionTreeClassifier()
tree_model.fit(x_train, y_train)

# Tune the tree with a cross-validated grid search over its main parameters.
# Note: 'logloss' is not a valid scorer name; log loss is selected with 'neg_log_loss'.
grid_para_tree = {'criterion': ['gini', 'entropy'], 'max_depth': range(1, 11)}
grid_search_tree = GridSearchCV(tree_model, grid_para_tree, cv=5, scoring='neg_log_loss')
grid_search_tree.fit(x_train, y_train)
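
Once the search has run, the chosen parameters and the cross-validated score can be read off the fitted object via the standard GridSearchCV attributes:

print("Best tree parameters: %s" % grid_search_tree.best_params_)
print("Best CV log-loss: %.5f" % -grid_search_tree.best_score_)  # neg_log_loss is negated, so flip the sign
best_tree = grid_search_tree.best_estimator_  # already refit on the full training set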
# Fit a random forest on the training set.
from sklearn import ensemble
rf = ensemble.RandomForestClassifier()
rf.fit(x_train, y_train)  # fit with the default settings
#print("- The training error is: %.5f" % (1 - rf.score(x_train, y_train)))
#print("- The test error is: %.5f" % (1 - rf.score(x_test, y_test)))

# Find the best parameters using GridSearchCV (see the sketch below).
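
The gist cuts off here; a minimal sketch of the grid search it presumably goes on to run. The parameter grid below is an assumption for illustration, not the author's:

grid_para_forest = {'n_estimators': [10, 50, 100],  # assumed grid, not from the gist
                    'max_depth': range(1, 11)}
grid_search_forest = GridSearchCV(rf, grid_para_forest, cv=5, scoring='neg_log_loss')
grid_search_forest.fit(x_train, y_train)
print("Best forest parameters: %s" % grid_search_forest.best_params_)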