Skip to content

Instantly share code, notes, and snippets.

@Scstechr
Last active August 15, 2019 01:31
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save Scstechr/4b1e11f56ed51f181eebf77f7d57289c to your computer and use it in GitHub Desktop.
Save Scstechr/4b1e11f56ed51f181eebf77f7d57289c to your computer and use it in GitHub Desktop.
Search keywords via Hyper Collocation (https://hypcol.marutank.net/ja/)
"""
# What is this?:
- Hyper Collocation (by Ichiro Maruta) search from a console.
# Requirements:
- Python 3.6+ (f-strings are used) with bs4, lxml, selenium
- ChromeDriver (https://chromedriver.chromium.org/downloads) available on your PATH
# Usage:
bash: python main.py [search keyword]
# Acknowledgement:
Thanks to Ichiro Maruta for the Hyper Collocation service.
"""
import sys
from bs4 import BeautifulSoup
import lxml
from selenium.webdriver import Chrome, ChromeOptions
from selenium.webdriver.common.keys import Keys
def Driver():
    """Build and return a Chrome WebDriver configured to run headless."""
    print("[2/6] SETTING UP DRIVER...")
    chrome_options = ChromeOptions()
    # No browser window is opened: the '--headless' flag is passed to Chrome.
    chrome_options.add_argument('--headless')
    return Chrome(options=chrome_options)
def Search(driver, keyword, max_polls=1000):
    """Submit *keyword* on the already-loaded page and collect the results.

    The keyword is typed into the element with class ``input`` and submitted
    with RETURN. The page source is then re-parsed repeatedly until a
    ``span.infohead`` element shows up (presumably the page fills in results
    asynchronously -- TODO confirm).

    Args:
        driver: Selenium WebDriver with the search page already opened.
        keyword: Text to search for.
        max_polls: Maximum number of times the page source is re-parsed
            while waiting for the ``infohead`` element.

    Returns:
        A tuple ``(infohead, slist)`` where ``infohead`` is the text of the
        first ``span.infohead`` wrapped in newlines, and ``slist`` is the
        list of result ``div`` tags.

    Exits the process with status 1 (via ``sys.exit``) when the element never
    appears ("TIME OUT") or when there are no result entries (the infohead
    text is printed first).
    """
    # Local import keeps this block self-contained. The locator-based
    # find_element() works on Selenium 3 and 4, whereas the
    # find_element_by_* helpers were removed in Selenium 4.3.
    from selenium.webdriver.common.by import By

    print("[4/6] SEARCHING...")
    input_element = driver.find_element(By.CLASS_NAME, "input")
    input_element.send_keys(keyword)
    input_element.send_keys(Keys.RETURN)

    # Bounded polling: stop as soon as a parse finds the infohead element.
    for _ in range(max_polls):
        html = driver.page_source.encode('utf-8')
        soup = BeautifulSoup(html, "lxml")
        infohead = soup.find_all('span', class_="infohead")
        if infohead:
            break
    else:
        print("TIME OUT")
        sys.exit(1)

    # Only the parse that produced the infohead matters, so the result
    # entries are looked up once, on that same soup.
    slist = soup.find_all("div", class_="bar-list-box tooltip is-tooltip-top")
    infohead = '\n' + infohead[0].text + '\n'
    if not slist:
        # No result entries: show the header text and stop.
        print(infohead)
        sys.exit(1)
    return infohead, slist
def Parse(slist):
    """Turn result entries into a list of row dictionaries.

    Each entry is converted with ``str()`` and the text between the third and
    fourth double quote is taken as the tooltip string, expected to look like
    ``"<word>? — <count> occurences (<percent>)"`` where ``?`` is a single
    trailing character that gets dropped.

    Args:
        slist: Iterable of result entries (anything whose ``str()`` has the
            layout described above).

    Returns:
        A list of dicts with the keys ``'word'``, ``'occu'`` and ``'perc'``
        (all strings). The first dict additionally carries ``'wMaxLen'`` and
        ``'oMaxLen'``: the longest word / occurrence-count lengths over all
        rows. An empty input yields an empty list.
    """
    print("[5/6] PARSING...")
    wlist = []
    for div in slist:
        tooltip = str(div).split('"')[3]
        parts = tooltip.split(' — ')
        # WORD: newlines become spaces, then the last character is dropped.
        word = parts[0].replace("\n", ' ')[:-1]
        # 'occurences' is matched exactly as spelled here; presumably that is
        # how the page spells it -- do not "correct" it.
        stats = parts[1].replace('occurences', '')
        wlist.append({
            'word': word,
            # OCCURRENCE: first space-separated token.
            'occu': stats.split(' ')[0],
            # PERCENTAGE: text after '(' without the final character.
            'perc': stats.split('(')[1][:-1],
        })

    if not wlist:
        # Nothing to annotate; avoids indexing into an empty list.
        return wlist

    # Column widths are stored on the first row.
    wlist[0]['wMaxLen'] = max(len(row['word']) for row in wlist)
    wlist[0]['oMaxLen'] = max(len(row['occu']) for row in wlist)
    return wlist
def Print(infohead, wlist, limit=11):
    """Print the header text followed by a table of result rows.

    Args:
        infohead: Header text printed before the table.
        wlist: List of row dicts with the string keys ``'word'``, ``'occu'``
            and ``'perc'``; the first row must also hold the integer column
            widths ``'wMaxLen'`` and ``'oMaxLen'``.
        limit: Maximum number of rows printed. The default of 11 matches the
            previously hard-coded cut-off.

    Returns:
        None. Output goes to stdout; an empty ``wlist`` prints only the
        header text and a blank line.
    """
    print("[6/6] PRINTING OUT RESULT...")
    print(infohead)
    if not wlist:
        # No rows: there are no stored column widths to read.
        print()
        return

    # Widths come from the first row and cover every row, not only the
    # ones that end up being displayed.
    word_width = wlist[0]['wMaxLen'] + 2
    occu_width = wlist[0]['oMaxLen'] + 2
    for row in wlist[:limit]:
        word = row['word'].ljust(word_width)
        occu = row['occu'].ljust(occu_width)
        perc = row['perc'].rjust(6)
        print(f' {word} | {occu} | {perc}')
    print()
def main():
    """Run one search from the command line and print the result table.

    All command-line arguments are joined with spaces to form the keyword.
    The browser is always shut down once the search step is over, including
    when ``Search`` terminates the program through ``sys.exit``.
    """
    keyword = ' '.join(sys.argv[1:])
    print("[1/6] SET KEYWORD:", keyword)
    driver = Driver()
    try:
        print("[3/6] ACCESSING...")
        driver.get("https://hypcol.marutank.net/ja/")
        infohead, slist = Search(driver, keyword)
    finally:
        # Without this the headless Chrome process is left running after
        # the script ends.
        driver.quit()
    # The parsed tags no longer need the browser, so formatting happens
    # after it has been closed.
    Print(infohead, Parse(slist))
# Run the search only when executed as a script (python main.py <keyword>),
# so that importing this module does not launch a browser.
if __name__ == "__main__":
    main()
@Scstechr
Copy link
Author

スクリーンショット 2019-08-15 10 31 11

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment