# Scstechr/HyperCollocationSearch.py

## HyperCollocationSearch.py
"""
 # What is this?:
    - Hyper Collocation (by Ichiro Maruta) search from a console.
 # Requirements:
    - Python 3.x with bs4, lxml, selenium
    - Chromedriver(https://chromedriver.chromium.org/downloads) in an executable path
 # Usage:
    bash: python main.py [search keyword]
 # Acknowledgement:
  Thanks to Ichiro Maruta for the Hyper Collocation service.
"""

import sys
from bs4 import BeautifulSoup
import lxml
from selenium.webdriver import Chrome, ChromeOptions
from selenium.webdriver.common.keys import Keys

def Driver():
    """Create and return a Chrome WebDriver that runs headless.

    Prints the "[2/6]" progress line before building the driver.

    Returns:
        The ``Chrome`` instance constructed with the ``--headless`` argument.
    """
    print("[2/6] SETTING UP DRIVER...")
    chrome_options = ChromeOptions()
    # No browser window is wanted for a console tool.
    chrome_options.add_argument('--headless')
    return Chrome(options=chrome_options)

def Search(driver, keyword, timeout=30.0, poll_interval=0.1):
    """Submit *keyword* through the page's input box and wait for results.

    The page source is re-parsed until at least one ``<span class="infohead">``
    appears or *timeout* seconds have elapsed.

    Args:
        driver: Selenium WebDriver; the caller is expected to have loaded the
            search page already.
        keyword: Text typed into the element whose class is ``input``.
        timeout: Seconds to wait for the ``infohead`` span before giving up.
        poll_interval: Seconds to sleep between two looks at the page source.

    Returns:
        A tuple ``(infohead, slist)``: the text of the first ``infohead`` span
        wrapped in newlines, and the list of result ``<div>`` tags.

    Raises:
        SystemExit: with code 1 after printing ``TIME OUT`` when no
            ``infohead`` span shows up in time, or after printing the
            infohead text when the page contains no result ``<div>``.
    """
    # Local imports keep this function self-contained with respect to the
    # file's import block.
    import time
    from selenium.webdriver.common.by import By

    print("[4/6] SEARCHING...")
    # find_element_by_class_name() was removed in Selenium 4.3; the By-based
    # call is available in Selenium 3 and 4 alike.
    input_element = driver.find_element(By.CLASS_NAME, "input")
    input_element.send_keys(keyword)
    input_element.send_keys(Keys.RETURN)

    # Wait on wall-clock time instead of a fixed number of back-to-back polls,
    # whose total duration depended on how fast the machine could loop.
    deadline = time.monotonic() + timeout
    while True:
        soup = BeautifulSoup(driver.page_source, "lxml")
        infohead = soup.find_all('span', class_="infohead")
        if infohead:
            break
        if time.monotonic() >= deadline:
            print("TIME OUT")
            sys.exit(1)
        time.sleep(poll_interval)

    # Taken from the same snapshot in which the infohead span was found.
    slist = soup.find_all("div", class_="bar-list-box tooltip is-tooltip-top")
    infohead = '\n' + infohead[0].text + '\n'
    if not slist:
        # No result rows: show the header text only and stop.
        print(infohead)
        sys.exit(1)
    return infohead, slist

def Parse(slist):
    """Convert result ``<div>`` tags into row dictionaries for ``Print``.

    Each element is converted with ``str()`` and split on double quotes; the
    fourth piece is taken as the row text (presumably the tooltip attribute
    value -- verify against the page markup). That text is expected to look
    like ``"<word>  — <count> occurences (<percent>)"``.

    Args:
        slist: Iterable of objects whose ``str()`` is the tag markup.

    Returns:
        A list of dicts with the keys ``word``, ``occu`` and ``perc``. The
        first dict additionally carries ``wMaxLen`` and ``oMaxLen``, the
        longest ``word`` / ``occu`` lengths over all rows. An empty input
        yields an empty list.

    Raises:
        IndexError: if an element's markup does not have the expected shape.
    """
    print("[5/6] PARSING...")
    wlist = []
    for div in slist:
        text = str(div).split('"')[3]
        # Split once and reuse both halves.
        parts = text.split(' — ')
        # Newlines become spaces; the last character before the dash is dropped.
        word = parts[0].replace("\n", ' ')[:-1]
        # 'occurences' is spelled this way in the scraped text.
        rest = parts[1].replace('occurences', '')
        wlist.append({
            'word': word,
            'occu': rest.split('  ')[0],
            # Text after '(' minus its closing ')'.
            'perc': rest.split('(')[1][:-1],
        })

    if not wlist:
        # Nothing to attach the column widths to.
        return wlist

    # Print() reads the column widths from the first row.
    wlist[0]['wMaxLen'] = max(len(d['word']) for d in wlist)
    wlist[0]['oMaxLen'] = max(len(d['occu']) for d in wlist)
    return wlist

def Print(infohead, wlist):
    """Print the header text followed by an aligned table of result rows.

    Args:
        infohead: Header text printed before the table.
        wlist: Row dicts with ``word``, ``occu`` and ``perc`` keys; the first
            one must also carry ``wMaxLen`` and ``oMaxLen`` (column widths).
            An empty list prints the header only.

    Raises:
        KeyError: if the first row lacks ``wMaxLen`` / ``oMaxLen``.
    """
    print("[6/6] PRINTING OUT RESULT...")
    print(infohead)
    if not wlist:
        # No rows: there is no first element to read the widths from.
        print()
        return

    wMaxLen = wlist[0]['wMaxLen']
    oMaxLen = wlist[0]['oMaxLen']
    # Slicing is already safe on short lists, so no length check is needed.
    # NOTE(review): at most 11 rows are shown, as before; confirm whether a
    # top-10 was intended.
    for d in wlist[:11]:
        word = d['word'].ljust(wMaxLen + 2)
        occu = d['occu'].ljust(oMaxLen + 2)
        perc = d['perc'].rjust(6)
        print(f'  {word} | {occu} | {perc}')
    print()

def main():
    """Run one search for the keyword given on the command line.

    All command-line arguments are joined with spaces to form the keyword.
    The steps are: start the driver, open the Hyper Collocation page, search,
    parse the result rows and print them.

    Raises:
        SystemExit: with code 1 when no keyword was supplied, or when
            ``Search`` gives up (see there).
    """
    keyword = ' '.join(sys.argv[1:])
    if not keyword.strip():
        # Nothing to search for, so do not start a browser at all.
        print(f"USAGE: python {sys.argv[0]} [search keyword]")
        sys.exit(1)
    print("[1/6] SET KEYWORD:", keyword)
    driver = Driver()
    try:
        print("[3/6] ACCESSING...")
        driver.get("https://hypcol.marutank.net/ja/")
        infohead, slist = Search(driver, keyword)
    finally:
        # Search() may leave via sys.exit(); SystemExit still passes through
        # this clause, so the browser process is always shut down.
        driver.quit()
    # The parsed tags no longer need the live browser.
    Print(infohead, Parse(slist))


if __name__ == "__main__":
    main()
	"""
	# What is this?:
	- Hyper Collocation (by Ichiro Maruta) search from a console.
	# Requirements:
	- Python 3.x with bs4, lxml, selenium
	- Chromedriver(https://chromedriver.chromium.org/downloads) in an executable path
	# Usage:
	bash: python main.py [search keyword]
	# Acknowledgement:
	Thanks to Ichiro Maruta for the Hyper Collocation service.
	"""

	import sys
	from bs4 import BeautifulSoup
	import lxml
	from selenium.webdriver import Chrome, ChromeOptions
	from selenium.webdriver.common.keys import Keys

	def Driver():
	    """Create and return a Chrome WebDriver that runs headless.

	    Prints the "[2/6]" progress line before building the driver.

	    Returns:
	        The ``Chrome`` instance constructed with the ``--headless`` argument.
	    """
	    # The body's indentation had been lost; it is restored here.
	    print("[2/6] SETTING UP DRIVER...")
	    options = ChromeOptions()
	    options.add_argument('--headless')
	    driver = Chrome(options=options)
	    return driver

	def Search(driver, keyword, timeout=30.0, poll_interval=0.1):
	    """Submit *keyword* through the page's input box and wait for results.

	    The page source is re-parsed until at least one ``<span class="infohead">``
	    appears or *timeout* seconds have elapsed.

	    Args:
	        driver: Selenium WebDriver; the caller is expected to have loaded the
	            search page already.
	        keyword: Text typed into the element whose class is ``input``.
	        timeout: Seconds to wait for the ``infohead`` span before giving up.
	        poll_interval: Seconds to sleep between two looks at the page source.

	    Returns:
	        A tuple ``(infohead, slist)``: the text of the first ``infohead`` span
	        wrapped in newlines, and the list of result ``<div>`` tags.

	    Raises:
	        SystemExit: with code 1 after printing ``TIME OUT`` when no
	            ``infohead`` span shows up in time, or after printing the
	            infohead text when the page contains no result ``<div>``.
	    """
	    # Local imports keep this function self-contained with respect to the
	    # file's import block.
	    import time
	    from selenium.webdriver.common.by import By

	    print("[4/6] SEARCHING...")
	    # find_element_by_class_name() was removed in Selenium 4.3; the By-based
	    # call is available in Selenium 3 and 4 alike.
	    input_element = driver.find_element(By.CLASS_NAME, "input")
	    input_element.send_keys(keyword)
	    input_element.send_keys(Keys.RETURN)

	    # Wait on wall-clock time rather than a fixed count of back-to-back polls.
	    deadline = time.monotonic() + timeout
	    while True:
	        soup = BeautifulSoup(driver.page_source, "lxml")
	        infohead = soup.find_all('span', class_="infohead")
	        if infohead:
	            break
	        if time.monotonic() >= deadline:
	            print("TIME OUT")
	            sys.exit(1)
	        time.sleep(poll_interval)

	    # Taken from the same snapshot in which the infohead span was found.
	    slist = soup.find_all("div", class_="bar-list-box tooltip is-tooltip-top")
	    infohead = '\n' + infohead[0].text + '\n'
	    if not slist:
	        # No result rows: show the header text only and stop.
	        print(infohead)
	        sys.exit(1)
	    return infohead, slist

	def Parse(slist):
	    """Convert result ``<div>`` tags into row dictionaries for ``Print``.

	    Each element is converted with ``str()`` and split on double quotes; the
	    fourth piece is taken as the row text (presumably the tooltip attribute
	    value -- verify against the page markup). That text is expected to look
	    like ``"<word>  — <count> occurences (<percent>)"``.

	    Args:
	        slist: Iterable of objects whose ``str()`` is the tag markup.

	    Returns:
	        A list of dicts with the keys ``word``, ``occu`` and ``perc``. The
	        first dict additionally carries ``wMaxLen`` and ``oMaxLen``, the
	        longest ``word`` / ``occu`` lengths over all rows. An empty input
	        yields an empty list.

	    Raises:
	        IndexError: if an element's markup does not have the expected shape.
	    """
	    print("[5/6] PARSING...")
	    wlist = []
	    for div in slist:
	        text = str(div).split('"')[3]
	        # Split once and reuse both halves.
	        parts = text.split(' — ')
	        # Newlines become spaces; the last character before the dash is dropped.
	        word = parts[0].replace("\n", ' ')[:-1]
	        # 'occurences' is spelled this way in the scraped text.
	        rest = parts[1].replace('occurences', '')
	        wlist.append({
	            'word': word,
	            # Two-space separator, matching the well-formed copy of this
	            # function earlier in the file.
	            'occu': rest.split('  ')[0],
	            # Text after '(' minus its closing ')'.
	            'perc': rest.split('(')[1][:-1],
	        })

	    if not wlist:
	        # Nothing to attach the column widths to.
	        return wlist

	    # Print() reads the column widths from the first row.
	    wlist[0]['wMaxLen'] = max(len(d['word']) for d in wlist)
	    wlist[0]['oMaxLen'] = max(len(d['occu']) for d in wlist)
	    return wlist

	def Print(infohead, wlist):
	    """Print the header text followed by an aligned table of result rows.

	    Args:
	        infohead: Header text printed before the table.
	        wlist: Row dicts with ``word``, ``occu`` and ``perc`` keys; the first
	            one must also carry ``wMaxLen`` and ``oMaxLen`` (column widths).
	            An empty list prints the header only.

	    Raises:
	        KeyError: if the first row lacks ``wMaxLen`` / ``oMaxLen``.
	    """
	    print("[6/6] PRINTING OUT RESULT...")
	    print(infohead)
	    if not wlist:
	        # No rows: there is no first element to read the widths from.
	        print()
	        return

	    wMaxLen = wlist[0]['wMaxLen']
	    oMaxLen = wlist[0]['oMaxLen']
	    # Slicing is already safe on short lists, so no length check is needed.
	    # NOTE(review): at most 11 rows are shown, as before; confirm whether a
	    # top-10 was intended.
	    for d in wlist[:11]:
	        word = d['word'].ljust(wMaxLen + 2)
	        occu = d['occu'].ljust(oMaxLen + 2)
	        perc = d['perc'].rjust(6)
	        # Plain '|' separators and two leading spaces, matching the
	        # well-formed copy of this function; the backslash-escaped pipes
	        # were printed literally.
	        print(f'  {word} | {occu} | {perc}')
	    print()

	def main():
	    """Run one search for the keyword given on the command line.

	    All command-line arguments are joined with spaces to form the keyword.
	    The steps are: start the driver, open the Hyper Collocation page, search,
	    parse the result rows and print them.

	    Raises:
	        SystemExit: with code 1 when no keyword was supplied, or when
	            ``Search`` gives up (see there).
	    """
	    keyword = ' '.join(sys.argv[1:])
	    if not keyword.strip():
	        # Nothing to search for, so do not start a browser at all.
	        print(f"USAGE: python {sys.argv[0]} [search keyword]")
	        sys.exit(1)
	    print("[1/6] SET KEYWORD:", keyword)
	    driver = Driver()
	    try:
	        print("[3/6] ACCESSING...")
	        driver.get("https://hypcol.marutank.net/ja/")
	        infohead, slist = Search(driver, keyword)
	    finally:
	        # Search() may leave via sys.exit(); SystemExit still passes through
	        # this clause, so the browser process is always shut down.
	        driver.quit()
	    # The parsed tags no longer need the live browser.
	    Print(infohead, Parse(slist))


	if __name__ == "__main__":
	    main()