Skip to content

Instantly share code, notes, and snippets.

@moaikim

moaikim/main.py Secret

Created November 27, 2020 23:33
Show Gist options
  • Save moaikim/2f09eb81aef433e9f46cea76cbe36a70 to your computer and use it in GitHub Desktop.
Save moaikim/2f09eb81aef433e9f46cea76cbe36a70 to your computer and use it in GitHub Desktop.
WordCloud, Related Keywords
import argparse
import time
import queue
import pprint
import wordcloud_manager
from search_manager import SearchManager
def search(search_manager, where, keyword):
    """Look up the related keywords for *keyword* on one search engine.

    Args:
        search_manager: Object exposing ``searchGoogleRelatedKeywords``,
            ``searchNaverRelatedKeywords`` and ``searchDaumRelatedKeywords``.
        where: Engine name; compared case-sensitively against ``'google'``,
            ``'naver'`` and ``'daum'``.
        keyword: The term to search for.

    Returns:
        Whatever the matching ``search_manager`` method returns, or an empty
        list when *where* names none of the three engines.
    """
    if where == 'google':
        return search_manager.searchGoogleRelatedKeywords(keyword)
    if where == 'naver':
        return search_manager.searchNaverRelatedKeywords(keyword)
    if where == 'daum':
        return search_manager.searchDaumRelatedKeywords(keyword)
    # Unknown engine: nothing to report.
    return []
def search_to_search(search_manager, where, query, max):
    """Crawl related keywords breadth-first, starting from *query*.

    Each search result that has not been seen before is queued and later
    searched itself, so the crawl fans out from the seed query.

    Args:
        search_manager: Passed through to ``search``.
        where: Engine name; lower-cased before use.
        query: Seed keyword. Related keywords equal to it are ignored.
        max: Number of searches to perform before stopping. (The name shadows
            the builtin but is kept because it is part of the signature.)

    Returns:
        dict mapping each related keyword to the number of times it appeared
        in the results of the searches performed.
    """
    where = where.lower()
    frequencies = {}
    # Unbounded FIFO: with a size limit, put() would block forever once the
    # queue filled up, because nothing else consumes it concurrently.
    pending = queue.Queue()
    pending.put(query)
    searches_done = 0
    while not pending.empty() and searches_done != max:
        searches_done += 1
        keyword = pending.get()
        # Search the dequeued keyword (not the seed query) so the crawl
        # actually expands to second- and third-level related terms.
        for related in search(search_manager, where, keyword):
            if related == query:
                continue
            if related in frequencies:
                frequencies[related] += 1
            else:
                frequencies[related] = 1
                # First sighting: schedule it for its own search.
                pending.put(related)
    return frequencies
def main():
    """Command-line entry point: crawl related keywords and draw a word cloud.

    Parses ``where``, ``keyword`` and ``max`` from the command line, runs the
    crawl, pretty-prints the frequency table, renders it with
    ``wordcloud_manager.make`` and then idles until interrupted with Ctrl-C.
    The browser is closed on the way out, whether the run ends by interrupt
    or by an error.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument('where', type=str, choices=['google','naver','daum'], help="Choose where to search")
    parser.add_argument('keyword', type=str, help="Enter keywords to crawl")
    parser.add_argument('max', type=int, help="How many times will you repeat it?")
    args = parser.parse_args()

    # Create the browser outside the try block so the cleanup below never
    # runs against a name that was not bound yet.
    search_manager = SearchManager()
    try:
        result = search_to_search(search_manager, args.where, args.keyword, args.max)
        pprint.pprint(result, depth=2)
        wordcloud_manager.make(result)
        # Alternative rendering: wordcloud_manager.make_alice(result)
        # Keep the process alive until the user presses Ctrl-C.
        while True:
            time.sleep(1)
    except KeyboardInterrupt:
        # Ctrl-C is the normal way to end the program.
        pass
    finally:
        # Release the browser on every exit path, not only on Ctrl-C.
        search_manager.close()


if __name__ == "__main__":
    main()
from selenium.common.exceptions import WebDriverException
from selenium.webdriver import Chrome
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as ec
from selenium.webdriver.support.ui import WebDriverWait
class SearchManager():
    """Drive a Chrome browser through Selenium to scrape related-search keywords.

    The private helpers are best-effort: a Selenium failure (timeout, missing
    or stale element, ...) is reported through the return value (``False`` or
    an empty list) instead of an exception. Only Selenium errors are absorbed,
    so Ctrl-C and programming errors still propagate.
    """

    def __init__(self):
        """Start Chrome, maximise its window and prepare a 10-second waiter."""
        self.driver = Chrome()
        self.wait = WebDriverWait(self.driver, 10)
        self.driver.maximize_window()

    def _move_end(self):
        """Scroll the current page to the bottom."""
        self.driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")

    def _click(self, xpath):
        """Click the element at *xpath* once it is clickable.

        Returns:
            True on success, False if Selenium could not click it.
        """
        try:
            # until() hands back the element it waited for; click that one
            # rather than looking it up a second time.
            self.wait.until(ec.element_to_be_clickable((By.XPATH, xpath))).click()
            return True
        except WebDriverException:
            return False

    def _insert(self, xpath, value):
        """Type *value* into the first element matching *xpath*.

        Returns:
            True on success, False if Selenium could not type into it.
        """
        try:
            elements = self.wait.until(ec.visibility_of_all_elements_located((By.XPATH, xpath)))
            elements[0].send_keys(value)
            return True
        except WebDriverException:
            return False

    def _read(self, xpath):
        """Return the whitespace-separated words of the first match's text.

        Returns:
            list of str; empty if Selenium could not read the element.
        """
        try:
            elements = self.wait.until(ec.visibility_of_all_elements_located((By.XPATH, xpath)))
            return elements[0].text.split()
        except WebDriverException:
            return []

    def _reads(self, xpath):
        """Return the text of every element matching *xpath*.

        Returns:
            list of str, one entry per element; empty if Selenium could not
            read them.
        """
        try:
            elements = self.wait.until(ec.visibility_of_all_elements_located((By.XPATH, xpath)))
            return [element.text for element in elements]
        except WebDriverException:
            return []

    def start(self, url):
        """Navigate the browser to *url*."""
        self.driver.get(url)

    def close(self):
        """Shut the browser down.

        Uses ``quit()`` so the whole WebDriver session ends, instead of only
        closing the current window.
        """
        self.driver.quit()

    def searchGoogleRelatedKeywords(self, query):
        """Search Google for *query* and return its related-search texts.

        Returns:
            list of str; empty if the search box or button could not be used
            or no related-search elements were found.
        """
        # NOTE(review): these XPaths are tied to Google's page markup at the
        # time of writing - confirm they still match.
        self.start('https://www.google.com')
        if not self._insert('//*[@id="tsf"]/div[2]/div[1]/div[1]/div/div[2]/input', query):
            return []
        if not self._click('//*[@id="tsf"]/div[2]/div[1]/div[3]/center/input[1]'):
            return []
        self._move_end()
        return self._reads('//*[@id="brs"]/g-section-with-header/div[2]/div/p[@class="nVcaUb"]')

    def searchNaverRelatedKeywords(self, query):
        """Search Naver for *query* and return its related-keyword texts.

        Returns:
            list of str; empty if the search box or button could not be used
            or no related-keyword elements were found.
        """
        self.start('https://www.naver.com')
        if not self._insert('//*[@id="query"]', query):
            return []
        if not self._click('//*[@id="search_btn"]'):
            return []
        self._move_end()
        return self._reads('//*[@id="nx_footer_related_keywords"]/div/div[2]/ul/li[@class="item"]/a/div[@class="tit"]')

    def searchDaumRelatedKeywords(self, query):
        """Search Daum for *query* and return its related-keyword texts.

        Returns:
            list of str; empty if the search box or button could not be used
            or no related-keyword elements were found.
        """
        self.start('https://www.daum.net')
        if not self._insert('//*[@id="q"]', query):
            return []
        if not self._click('//*[@id="daumSearch"]/fieldset/div/div/button[2]'):
            return []
        # Best effort: the result of this click is deliberately ignored.
        self._click('//*[@id="netizen_more_btn_bottom"]')
        return self._reads('//*[@id="netizen_lists_bottom"]/span[@class="wsn"]')
from wordcloud import WordCloud
import matplotlib.pyplot as plt
import numpy as np
from PIL import Image
# Default canvas dimensions passed to WordCloud as width/height.
WIDTH, HEIGHT = 800, 800
# Font file passed to WordCloud as font_path (relative to the working directory).
FONT = './NanumGothic.ttf'
# Background colour passed to WordCloud as background_color.
BACKGROUND = 'white'
def setSize(width, height):
    """Set the module-level canvas size used by later word clouds.

    Args:
        width: New value for the module-level ``WIDTH``.
        height: New value for the module-level ``HEIGHT``.
    """
    # Without the global declaration these assignments would only create
    # locals and leave the module defaults untouched.
    global WIDTH, HEIGHT
    WIDTH = width
    HEIGHT = height
def make(data):
    """Render a word cloud of *data* in a matplotlib window.

    Args:
        data: Mapping of word -> frequency. When it is empty nothing is
            drawn, because a word cloud cannot be generated from zero words.

    Returns:
        None. ``plt.show()`` is called to display the figure.
    """
    if not data:
        # generate_from_frequencies() rejects an empty mapping; skip drawing.
        return

    cloud = WordCloud(
        width=WIDTH,
        height=HEIGHT,
        font_path=FONT,
        background_color=BACKGROUND,
    ).generate_from_frequencies(data)

    plt.figure(figsize=(10, 10))
    plt.imshow(cloud, interpolation="bilinear")
    plt.axis("off")
    plt.show()
def make_alice(data):
    """Render a word cloud of *data* shaped by the ``./alice_mask.png`` mask.

    Args:
        data: Mapping of word -> frequency. When it is empty nothing is
            drawn, because a word cloud cannot be generated from zero words.

    Returns:
        None. ``plt.show()`` is called to display the figure.
    """
    if not data:
        # generate_from_frequencies() rejects an empty mapping; skip drawing.
        return

    # Copy the pixels into an array inside the with-block so the image file
    # is closed as soon as the mask has been loaded.
    with Image.open("./alice_mask.png") as mask_image:
        alice_mask = np.array(mask_image)

    cloud = WordCloud(
        width=WIDTH,
        height=HEIGHT,
        font_path=FONT,
        background_color=BACKGROUND,
        mask=alice_mask,
    ).generate_from_frequencies(data)

    plt.figure(figsize=(10, 10))
    plt.imshow(cloud, interpolation="bilinear")
    plt.axis("off")
    plt.show()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment