This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
def modelfit(alg, dtrain, predictors, useTrainCV=True, cv_folds=5, early_stopping_rounds=50):
    """Fit an XGBoost classifier, optionally tuning n_estimators via xgb.cv.

    Parameters
    ----------
    alg : xgboost.XGBClassifier
        Estimator to fit; mutated in place via set_params()/fit().
    dtrain : pandas.DataFrame
        Training data containing the predictor columns and the target column
        (named by the module-level ``target`` variable).
    predictors : list of str
        Column names used as model inputs.
    useTrainCV : bool
        When True, run xgb.cv with early stopping and reset n_estimators to
        the best boosting round before the final fit.
    cv_folds : int
        Number of cross-validation folds.
    early_stopping_rounds : int
        Stop CV when AUC has not improved for this many rounds.
    """
    if useTrainCV:
        xgb_param = alg.get_xgb_params()
        xgtrain = xgb.DMatrix(dtrain[predictors].values, label=dtrain[target].values)
        # FIX: 'show_progress' was removed from xgb.cv; 'verbose_eval' is the
        # supported way to silence per-round output.
        cvresult = xgb.cv(xgb_param, xgtrain,
                          num_boost_round=alg.get_params()['n_estimators'],
                          nfold=cv_folds, metrics='auc',
                          early_stopping_rounds=early_stopping_rounds,
                          verbose_eval=False)
        # cv stops at the best iteration, so its row count is the tuned tree count.
        alg.set_params(n_estimators=cvresult.shape[0])
    # Fit the algorithm on the data.
    # FIX: the original mixed dtrain[target] (above) with the hard-coded column
    # 'Disbursed' here; use the module-level ``target`` name consistently.
    alg.fit(dtrain[predictors], dtrain[target], eval_metric='auc')
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# FIX: sklearn.grid_search was deprecated in scikit-learn 0.18 and removed in
# 0.20; GridSearchCV now lives in sklearn.model_selection (same class API).
import sklearn.model_selection as gs

# Fit a random forest on the training set with default hyper-parameters.
from sklearn import ensemble

rf = ensemble.RandomForestClassifier()
rf.fit(x_train, y_train)  # x_train / y_train are defined earlier in the notebook
# print("- The training error is: %.5f" % (1 - rf.score(x_train, y_train)))
# print("- The test error is: %.5f" % (1 - rf.score(x_test, y_test)))
# Find the best parameters using gs.GridSearchCV.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# (1) Fit a decision tree model on the training set with the default settings.
from sklearn import tree
# FIX: sklearn.grid_search was removed in scikit-learn 0.20; use
# sklearn.model_selection instead (same GridSearchCV class).
import sklearn.model_selection as gs

tree_model = tree.DecisionTreeClassifier()
tree_model.fit(x_train, y_train)
# Search criterion and tree depth (1..10) with 5-fold cross-validation.
grid_para_tree = {'criterion': ['gini', 'entropy'], 'max_depth': range(1, 11)}
# FIX: 'logloss' is not a valid sklearn scorer name and raises ValueError at
# fit time; the log-loss scorer is 'neg_log_loss' (negated so higher is better).
grid_search_tree = gs.GridSearchCV(tree_model, grid_para_tree, cv=5, scoring='neg_log_loss')
grid_search_tree.fit(x_train, y_train)
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Assemble the Shiny app from the dashboard UI pieces and the server function.
shinyApp(
  ui = dashboardPage(header, sidebar, body, skin = "red"),  # "red" dashboard skin
  # FIX: `server <- server` performs an assignment in the calling environment
  # and passes the value positionally; use the named argument form instead.
  server = server
)
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Shiny server function: wires reactive inputs to the dashboard outputs.
# NOTE(review): this fragment is truncated — the reactive({ ... }) body and the
# closing brace are outside the visible excerpt.  The trailing "| |" markers
# are paste artifacts from the gist table, not R source.
server <- function(input, output) { | |
# ... | |
# ... Other functions are intentionally omitted for brevity ... | |
# ... | |
#//////////////////////////////////////////////////////////////////////////////// | |
# reactive function for comparison Map 1 and comparison table 1 | |
#//////////////////////////////////////////////////////////////////////////////// | |
# Recomputes the map/table input data whenever its reactive dependencies change.
updateInputDataForMapByJobTitle1 <- reactive({ |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# shinydashboard layout: page header plus the left-hand navigation sidebar.
# NOTE(review): this fragment is truncated — sidebarMenu( is not closed within
# the visible excerpt.  The trailing "| |" markers are gist-table paste
# artifacts, not R source.
header <- dashboardHeader( | |
title = "DS Salary Explorer" | |
) | |
# Each menuItem's tabName must match a tabItem defined in the dashboard body.
sidebar <- dashboardSidebar( | |
sidebarMenu( | |
menuItem("Salary Scatter Plot", tabName = "myTabForScatterPlot", icon = icon("bar-chart-o")), | |
menuItem("Salary Data Explorer", tabName = "myTabForDataTable", icon = icon("fa fa-table")), | |
menuItem("Salary Comparison Map", tabName = "myTabForGvisMap", icon = icon("fa fa-map-marker")), | |
menuItem("Top Recruiters", tabName = "myTabForRecruitRanking", icon = icon("fa fa-list-ol")), |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Load the manually-reviewed company list and prepare containers for building a
# de-duplicated copy.  NOTE(review): this fragment is truncated — the for-loop
# body continues beyond the visible excerpt — and the trailing "| |" markers
# are gist-table paste artifacts, not Python source.
# Indeed_comp_complete_for_review.csv only contains companies having a company page in the indeed.com | |
# (omitted companies which do not have their indeed company pages. It is based on the 'indeed_companies.csv') | |
df_received_for_review = pd.read_csv('./data/Indeed_comp_complete_for_review.csv') | |
df_for_review = pd.DataFrame() # new data frame | |
comp_set = set() # for duplicate check | |
for i in range(0,len(df_received_for_review)): | |
# presumably 'comp_name' is the company-name column of the reviewed CSV — TODO confirm
target_comp_name = df_received_for_review.iloc[i]['comp_name'] | |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Persist the scraped company rows; utf-8 keeps non-ASCII company names intact.
# NOTE(review): to_csv also writes the DataFrame index as an extra first column
# by default — pass index=False if the index is not meaningful downstream.
# Save the result to CSV | |
df_received.to_csv('./data/indeed_companies.csv', encoding='utf-8') |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Walk every scraped company row and fetch its Indeed company page for details.
# (Python 2 code: urllib.urlopen; under Python 3 use urllib.request.urlopen.)
df_received = df
for i in range(0, len(df_received)):  # get all the company details
    target_comp_name = df_received.iloc[i]['comp_name']
    # CONSISTENCY FIX: read from df_received (bound to the same object as df
    # above) so the loop depends on a single frame.
    url_2nd = df_received.iloc[i]['overall_link']
    # FIX: comparisons against None should use identity (`is not None`, PEP 8);
    # `!= None` can misbehave with objects that override __eq__.
    if url_2nd is not None:
        target_2nd = Soup(urllib.urlopen(url_2nd), "lxml")
        # the company logo <img> lives inside the page-header logo div
        comp_logo = target_2nd.find("div", {"id": "cmp-header-logo"}).find('img')
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Crawl Indeed search-result pages 1..100 (the site caps scrapable pagination
# at page 100).  NOTE(review): the trailing "| |" markers are gist-table paste
# artifacts, not Python source, and the inner loop body continues beyond the
# visible excerpt.
for page in range(1,101): # page from 1 to 100 (last page we can scrape is 100) | |
# NOTE(review): `page` is reassigned from the 1-based page number to the
# 0-based result offset (10 results per page), shadowing the loop counter —
# any later use of `page` sees the offset, not the page number.
page = (page-1) * 10 | |
url = "%s%s%s%d" % (base_url, sort_by, start_from, page) # get full url | |
# Python 2 urllib; under Python 3 this would be urllib.request.urlopen.
target = Soup(urllib.urlopen(url), "lxml") | |
targetElements = target.findAll('div', attrs={'class' : ' row result'}) # we're interested in each row (= each job) | |
# trying to get each specific job information (such as company name, job title, urls, ...) | |
for elem in targetElements: | |
comp_name = elem.find('span', attrs={'itemprop':'name'}).getText().strip()
NewerOlder