Last active
August 15, 2019 01:31
-
-
Save Scstechr/4b1e11f56ed51f181eebf77f7d57289c to your computer and use it in GitHub Desktop.
Search keywords via Hyper Collocation (https://hypcol.marutank.net/ja/)
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
""" | |
# What is this?:
- Hyper Collocation (by Ichiro Maruta) search from a console.
# Requirements:
- Python 3.x with bs4, lxml, selenium
- ChromeDriver (https://chromedriver.chromium.org/downloads) available on the executable PATH
# Usage:
bash: python main.py [search keyword]
# Acknowledgement:
Thanks to Ichiro Maruta for the Hyper Collocation service.
""" | |
import sys
import time

import lxml
from bs4 import BeautifulSoup
from selenium.webdriver import Chrome, ChromeOptions
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys
def Driver():
    """Create and return a headless Chrome WebDriver.

    Prints the step 2 progress banner, builds a ``ChromeOptions`` object
    carrying the ``--headless`` argument, and returns the ``Chrome``
    instance constructed from those options.
    """
    print("[2/6] SETTING UP DRIVER...")
    chrome_options = ChromeOptions()
    chrome_options.add_argument('--headless')
    return Chrome(options=chrome_options)
def Search(driver, keyword, max_polls=1000, poll_interval=0.05):
    """Submit *keyword* on the loaded page and wait for the result markup.

    The keyword is typed into the element with class ``input`` and
    submitted with RETURN.  The page source is then polled until at least
    one ``<span class="infohead">`` is present.

    Args:
        driver: WebDriver that has already loaded the search page.
        keyword: Text to type into the search box.
        max_polls: Maximum number of times the page source is inspected
            before giving up.
        poll_interval: Seconds to sleep between unsuccessful polls.

    Returns:
        A ``(infohead, slist)`` tuple: ``infohead`` is the text of the
        first ``infohead`` span wrapped in newlines, and ``slist`` is the
        collection of ``div`` elements whose class is
        ``bar-list-box tooltip is-tooltip-top``.

    Raises:
        SystemExit: with status 1 when no ``infohead`` span shows up
            within ``max_polls`` polls (after printing ``TIME OUT``), or
            when the span is present but no result ``div`` was found
            (after printing the ``infohead`` text).
    """
    print("[4/6] SEARCHING...")
    # find_element(By..., ...) works on both Selenium 3 and 4; the
    # find_element_by_* helpers were removed in Selenium 4.3.
    input_element = driver.find_element(By.CLASS_NAME, "input")
    input_element.send_keys(keyword)
    input_element.send_keys(Keys.RETURN)

    for _ in range(max_polls):
        html = driver.page_source.encode('utf-8')
        soup = BeautifulSoup(html, "lxml")
        infohead = soup.find_all('span', class_="infohead")
        if infohead:
            # Read the result rows from the same snapshot as the header.
            slist = soup.find_all(
                "div", class_="bar-list-box tooltip is-tooltip-top")
            break
        # Pause briefly instead of busy-spinning against the driver.
        time.sleep(poll_interval)
    else:
        print("TIME OUT")
        sys.exit(1)

    infohead = '\n' + infohead[0].text + '\n'
    if not slist:
        print(infohead)
        sys.exit(1)
    return infohead, slist
def Parse(slist):
    """Turn result elements into a list of word/occurrence/percentage dicts.

    For every element, its string form is split on double quotes and the
    piece at index 3 is used as the tooltip text (presumably the value of
    the element's second attribute; verify against the live markup).
    That text is split on ``' — '``: the left part, with newlines turned
    into spaces and its last character dropped, becomes ``'word'``; the
    right part yields ``'occu'`` (text before the first space) and
    ``'perc'`` (text after the first ``(`` minus its last character).

    The first dict additionally carries ``'wMaxLen'`` and ``'oMaxLen'``,
    the longest ``'word'`` and ``'occu'`` lengths over all entries.

    Raises:
        IndexError: if *slist* is empty or an element's text does not
            have the expected shape.
    """
    print("[5/6] PARSING...")
    separator = ' — '
    entries = []
    for element in slist:
        tooltip = str(element).split('"')[3]
        # WORD
        word = tooltip.split(separator)[0].replace("\n", ' ')[:-1]
        # OCCURRENCE / PERCENTAGE share the text after the separator
        remainder = tooltip.split(separator)[1].replace('occurences', '')
        entries.append({
            'word': word,
            'occu': remainder.split(' ')[0],
            'perc': remainder.split('(')[1][:-1],
        })
    longest_word = max((len(entry['word']) for entry in entries), default=0)
    longest_occu = max((len(entry['occu']) for entry in entries), default=0)
    # Column widths ride along on the first entry for the printer to pick up.
    entries[0]['wMaxLen'] = longest_word
    entries[0]['oMaxLen'] = longest_occu
    return entries
def Print(infohead, wlist):
    """Print the header text followed by an aligned table of results.

    Column widths come from the ``'wMaxLen'`` and ``'oMaxLen'`` keys of
    the first entry.  At most the first 11 entries are printed, each as
    `` word | occu | perc`` with the word and occurrence columns
    left-justified (width + 2) and the percentage right-justified to 6
    characters.  A blank line is printed after the table.

    Raises:
        IndexError: if *wlist* is empty.
    """
    print("[6/6] PRINTING OUT RESULT...")
    print(infohead)
    first = wlist[0]
    word_width = first['wMaxLen'] + 2
    occu_width = first['oMaxLen'] + 2
    # Slicing already copes with shorter lists, so no length check is needed.
    for row in wlist[:11]:
        word = row['word'].ljust(word_width)
        occu = row['occu'].ljust(occu_width)
        perc = row['perc'].rjust(6)
        print(f' {word} | {occu} | {perc}')
    print()
def main():
    """Run one console search: read the keyword, query the site, print results.

    The keyword is every command-line argument joined with single spaces.
    A headless Chrome driver is created, pointed at the Hyper Collocation
    page, and handed to ``Search``; the parsed results are then printed.

    The driver is always shut down on the way out, including when
    ``Search`` terminates the program via ``sys.exit``, so no headless
    Chrome process is left running.
    """
    keyword = ' '.join(sys.argv[1:])
    print("[1/6] SET KEYWORD:", keyword)
    driver = Driver()
    try:
        print("[3/6] ACCESSING...")
        driver.get("https://hypcol.marutank.net/ja/")
        infohead, slist = Search(driver, keyword)
        Print(infohead, Parse(slist))
    finally:
        # SystemExit raised inside Search also passes through here.
        driver.quit()
# Run the search only when executed as a script, so that importing this
# module does not start a browser session.
if __name__ == "__main__":
    main()
Author
Scstechr
commented
Aug 15, 2019
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment