Skip to content

Instantly share code, notes, and snippets.

View kangjin2014's full-sized avatar
:electron:
Focusing

Ryan J Kang kangjin2014

:electron:
Focusing
  • Toronto, Canada
View GitHub Profile
import math.abs

/** Tiny demo of a relative-tolerance closeness test (fixed-point style). */
object Playground {
  def main(args: Array[String]): Unit = {
    val tolerance = 0.001

    // True when the relative difference |x - y| / |x| is below `tolerance`.
    // Fix: the original computed abs((x - y) / x) / x — dividing by x a
    // second time, which skews the test for |x| != 1 and flips the
    // comparison's sense for negative x. The standard relative check
    // drops the stray "/ x".
    def isCloseEnough(x: Double, y: Double): Boolean =
      abs((x - y) / x) < tolerance
  }
}
class LoadFiles(object):
    """Holds default data-file paths and loads the skills dictionary.

    Fix vs. original: the constructor was named ``__int__`` (typo), so it
    was never called and the path attributes were never set; renamed to
    ``__init__``.
    """

    def __init__(self):
        # Default locations of the input artifacts.
        self.path_to_dict = 'data/key_skill.csv'
        self.path_to_jobs = 'data/ds.csv'
        self.path_to_resume = 'data/resume_ryan_kang.docx'

    def load_skills_dict(self, path_to_dict):
        """Read the skills-dictionary CSV (no header row) into a DataFrame."""
        df_skills = pd.read_csv(path_to_dict, encoding='latin1', header=None)
        return df_skills
## __init__.py
# NOTE(review): illustrative template, not runnable Python —
# "<folder>" and "<file>" are placeholders for real module names.
def init():
    # Absolute import (placeholder syntax) vs. explicit relative import.
    from <folder> import <file>
    from . import <file> # "." means current folder/module
    a = func_0()
    b = funk_1()  # NOTE(review): probably a typo for func_1 — confirm
## __main__.py
# Script entry-point guard; the body that should follow is truncated here.
if __name__ == '__main__':
@kangjin2014
kangjin2014 / fulltext_html_parser.py
Created December 30, 2017 18:56
fulltext cleaning
from bs4 import BeautifulSoup
import urllib.request
import urllib.error


def parser_job_link(job_link):
    """Download a job posting and return its visible text.

    Fetches *job_link* with a browser-like User-Agent (some job boards
    reject urllib's default UA), strips ``<script>``/``<style>`` elements,
    and returns the remaining text. Returns ``None`` when the fetch fails.

    Fixes vs. original: ``import urllib`` alone does not expose
    ``urllib.request`` in Python 3, and the ``try`` had no handler
    (snippet was truncated).
    """
    try:
        req = urllib.request.Request(job_link,
                                     headers={'User-Agent': 'Mozilla/5.0'})
        html = urllib.request.urlopen(req).read()
    except (urllib.error.URLError, ValueError):
        # Best-effort: bad URL or network failure yields no text.
        return None
    soup = BeautifulSoup(html, "html.parser")
    # Drop non-visible markup before extracting text.
    for script in soup(["script", "style"]):
        script.extract()
    text = soup.get_text()
    return text
from sklearn.cluster import KMeans
from sklearn import metrics
from scipy.spatial.distance import cdist
import numpy as np
import matplotlib.pyplot as plt

# Elbow method to choose k for k-means.
# NOTE(review): assumes `X` is the (n_samples, n_features) data array,
# defined elsewhere — confirm before running.
distortions = []
K = range(1, 10)
for k in K:
    # Original snippet was truncated inside this loop; standard elbow body:
    # fit k clusters, then record the mean distance of each sample to its
    # nearest centroid. The "elbow" in distortions-vs-k suggests a good k.
    model = KMeans(n_clusters=k).fit(X)
    distortions.append(
        sum(np.min(cdist(X, model.cluster_centers_, 'euclidean'), axis=1))
        / X.shape[0])
@kangjin2014
kangjin2014 / pull.py
Created December 19, 2017 02:20
in case normal request doesn't work.
from urllib.request import Request, urlopen

# Fetch a page while spoofing a browser User-Agent — useful when the
# plain urllib request is rejected ("in case normal request doesn't work").
# Fix: urlopen requires a scheme; the bare 'www.google.com' raised
# ValueError: unknown url type.
link = 'https://www.google.com'  # define the link
req = Request(link, headers={'User-Agent': 'Mozilla/5.0'})
webpage = urlopen(req).read()
@kangjin2014
kangjin2014 / pull.py
Created December 19, 2017 02:20
in case normal request doesn't work.
from urllib.request import Request, urlopen

# Fetch a page while spoofing a browser User-Agent — useful when the
# plain urllib request is rejected ("in case normal request doesn't work").
# Fix: urlopen requires a scheme; the bare 'www.google.com' raised
# ValueError: unknown url type.
link = 'https://www.google.com'  # define the link
req = Request(link, headers={'User-Agent': 'Mozilla/5.0'})
webpage = urlopen(req).read()
@kangjin2014
kangjin2014 / web2pdf.py
Created December 15, 2017 05:43
I created this in response to an enquiry about automating webpage pulling and saving each page as a PDF with a defined name. Download the .py file and run it directly. Configuration instructions are in the script.
import pandas as pd
import numpy as np
import pdfkit
# Configure the PDF layout: wkhtmltopdf options passed through pdfkit.
# Fix: the original dict was left unclosed (snippet truncated); closed it
# and added the matching left margin for symmetric 0.75in margins.
options = {
    'page-size': 'A4',
    'margin-top': '0.75in',
    'margin-right': '0.75in',
    'margin-bottom': '0.75in',
    'margin-left': '0.75in',
}
@kangjin2014
kangjin2014 / web2page.py
Last active December 15, 2017 05:51
web2page.py
from selenium import webdriver

# Drive a headless PhantomJS browser to force a full page render.
driver = webdriver.PhantomJS()
driver.maximize_window()
# NOTE(review): `link` is not defined anywhere in this snippet — it must
# be assigned the target URL before this call runs.
driver.get(link)
# Scroll in many small steps so lazily-loaded content gets triggered.
# The target is document.body.scrollHeight / scheight: early iterations
# (scheight ~ 0.1) request ~10x the page height (clamped to the bottom),
# and as scheight grows toward 9.9 the viewport moves back up the page.
scheight = .1
while scheight < 9.9:
    driver.execute_script("window.scrollTo(0, document.body.scrollHeight/%s);" % scheight)
    scheight += .01
@kangjin2014
kangjin2014 / scrapy_ryerson.py
Created December 15, 2017 00:53
scrapy_ryerson.py
# Fix: the scrapy.contrib.* modules were deprecated and then removed
# (Scrapy >= 1.0); the modern module paths are scrapy.linkextractors and
# scrapy.spiders.
from scrapy.linkextractors import LinkExtractor
from scrapy.spiders import CrawlSpider, Rule


class DomainSpider(CrawlSpider):
    """Crawl ryerson.ca, handling pages whose URL matches "graduate/"."""

    name = 'prof'
    allowed_domains = ['ryerson.ca']
    start_urls = ['http://www.ryerson.ca/']

    rules = (
        # follow=True continues crawling from matched pages as well.
        # NOTE(review): the 'parse_item' callback is not defined in this
        # snippet — presumably truncated; confirm it exists in the spider.
        Rule(LinkExtractor(allow=r"graduate/"), callback='parse_item', follow=True),
    )