# Fit an XGBoost model; optionally use xgboost's built-in CV with early stopping
# to pick n_estimators first (assumes `target` names the label column, 'Disbursed').
import xgboost as xgb

def modelfit(alg, dtrain, predictors, useTrainCV=True, cv_folds=5, early_stopping_rounds=50):
    if useTrainCV:
        xgb_param = alg.get_xgb_params()
        xgtrain = xgb.DMatrix(dtrain[predictors].values, label=dtrain[target].values)
        cvresult = xgb.cv(xgb_param, xgtrain, num_boost_round=alg.get_params()['n_estimators'],
                          nfold=cv_folds, metrics='auc',
                          early_stopping_rounds=early_stopping_rounds, show_progress=False)
        # Reset n_estimators to the optimal round count found by early stopping
        alg.set_params(n_estimators=cvresult.shape[0])

    # Fit the algorithm on the data
    alg.fit(dtrain[predictors], dtrain['Disbursed'], eval_metric='auc')
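A minimal call sketch (the `train` DataFrame, the feature-list construction, and the parameter values below are assumptions for illustration, not from the original gist):

from xgboost.sklearn import XGBClassifier

target = 'Disbursed'
predictors = [c for c in train.columns if c != target]  # assumed feature list
xgb1 = XGBClassifier(learning_rate=0.1, n_estimators=1000, max_depth=5,
                     objective='binary:logistic', seed=27)
modelfit(xgb1, train, predictors)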
# Fit a random forest on the training set
# (x_train/y_train are assumed to come from an earlier train/test split).
from sklearn import ensemble

rf = ensemble.RandomForestClassifier()
rf.fit(x_train, y_train)
print "- The training error is: %.5f" % (1 - rf.score(x_train, y_train))
print "- The test error is: %.5f" % (1 - rf.score(x_test, y_test))
# (1) Fit a decision tree model on the training set with the default settings,
#     then find the best parameters using grid_search.GridSearchCV.
from sklearn import tree
import sklearn.grid_search as gs

tree_model = tree.DecisionTreeClassifier()
tree_model.fit(x_train, y_train)

grid_para_tree = {'criterion': ['gini', 'entropy'], 'max_depth': range(1, 11)}
grid_search_tree = gs.GridSearchCV(tree_model, grid_para_tree, cv=5, scoring='log_loss')
grid_search_tree.fit(x_train, y_train)
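Once the search has run, the chosen setting and its cross-validated score can be read off the fitted object:

print "- Best parameters: %s" % grid_search_tree.best_params_
print "- Best CV score:   %.5f" % grid_search_tree.best_score_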
shinyApp(
  ui = dashboardPage(header, sidebar, body, skin = "red"),  # skin "red"
  server = server
)
server <- function(input, output) {
  # ...
  # ... Other functions are intentionally omitted for brevity ...
  # ...

  #////////////////////////////////////////////////////////////////////////////////
  # reactive function for comparison Map 1 and comparison table 1
  #////////////////////////////////////////////////////////////////////////////////
  updateInputDataForMapByJobTitle1 <- reactive({
    # ... (reactive body omitted in this excerpt) ...
  })
}
header <- dashboardHeader(
  title = "DS Salary Explorer"
)

sidebar <- dashboardSidebar(
  sidebarMenu(
    # icon() expects bare Font Awesome names; it adds the "fa" class itself
    menuItem("Salary Scatter Plot", tabName = "myTabForScatterPlot", icon = icon("bar-chart-o")),
    menuItem("Salary Data Explorer", tabName = "myTabForDataTable", icon = icon("table")),
    menuItem("Salary Comparison Map", tabName = "myTabForGvisMap", icon = icon("map-marker")),
    menuItem("Top Recruiters", tabName = "myTabForRecruitRanking", icon = icon("list-ol"))
  )
)
import pandas as pd

# Indeed_comp_complete_for_review.csv only contains companies that have a company
# page on indeed.com (companies without an Indeed company page were dropped;
# it is derived from 'indeed_companies.csv').
df_received_for_review = pd.read_csv('./data/Indeed_comp_complete_for_review.csv')
df_for_review = pd.DataFrame()  # new data frame
comp_set = set()                # for duplicate check

for i in range(0, len(df_received_for_review)):
    target_comp_name = df_received_for_review.iloc[i]['comp_name']
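    # The loop body is cut off in this excerpt; given the `comp_set` duplicate
    # check above, the intent is presumably along these lines (a hedged sketch,
    # not the original code):
    if target_comp_name in comp_set:
        continue                          # skip companies already seen (assumed)
    comp_set.add(target_comp_name)
    df_for_review = df_for_review.append(df_received_for_review.iloc[[i]])  # assumed accumulation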
import urllib
from bs4 import BeautifulSoup as Soup

# Save the result to CSV
df_received = df
df_received.to_csv('./data/indeed_companies.csv', encoding='utf-8')

# Get the details of every company from its Indeed company page
for i in range(0, len(df_received)):
    target_comp_name = df_received.iloc[i]['comp_name']
    url_2nd = df.iloc[i]['overall_link']
    if url_2nd is not None:
        target_2nd = Soup(urllib.urlopen(url_2nd), "lxml")
        comp_logo = target_2nd.find("div", {"id": "cmp-header-logo"}).find('img')
for page in range(1, 101):   # pages 1 to 100 (the last page we can scrape is 100)
    start = (page - 1) * 10  # Indeed paginates in steps of 10
    url = "%s%s%s%d" % (base_url, sort_by, start_from, start)  # build the full url
    target = Soup(urllib.urlopen(url), "lxml")

    targetElements = target.findAll('div', attrs={'class': ' row result'})  # each row = one job posting

    # Get each job's specific information (company name, job title, urls, ...)
    for elem in targetElements:
        comp_name = elem.find('span', attrs={'itemprop': 'name'}).getText().strip()
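        # This excerpt stops here; the save-to-CSV step above writes a DataFrame
        # `df` with 'comp_name' and 'overall_link' columns, so each parsed element
        # is presumably accumulated roughly like this (a hedged sketch; `job_rows`
        # is a hypothetical list initialized as [] before the outer loop):
        job_rows.append({'comp_name': comp_name})

# df = pd.DataFrame(job_rows)  # assumed: the frame later saved to indeed_companies.csv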