Created
July 22, 2018 10:39
-
-
Save junhoyeo/f2f782aa34158ee3af6d6df01a7b1f1d to your computer and use it in GitHub Desktop.
Extract Korean keywords from the web with Python 3
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import requests | |
from bs4 import BeautifulSoup | |
from krwordrank.hangle import normalize | |
from krwordrank.word import KRWordRank | |
from konlpy.tag import Hannanum | |
def parse(url):
    """Fetch *url* and return the keywords extracted from its HTML."""
    print('\x1b[0;30;47mURL\x1b[0m : ' + url)
    response = requests.get(url)
    return keywords(response.text)
def parse_all(articles):
    """Parse every URL in *articles* and return a list of per-article keywords.

    Prints a colored divider line before the first article, between
    consecutive articles, and after the last one.
    """
    divider = '\x1b[5;30;44m' + '=' * 30 + '\x1b[0m'
    results = []
    print(divider)
    for index, article_url in enumerate(articles):
        if index:
            print(divider)
        results.append(parse(article_url))
    print(divider)
    return results
def getsoup(html):
    """Parse *html* with BeautifulSoup, print the page title, return the tree."""
    parsed = BeautifulSoup(html, 'html.parser')
    title = parsed.title.text
    print('\x1b[0;30;47mTITLE\x1b[0m : ' + title)
    return parsed
def clean(soup):
    """Return the visible text of *soup*'s <body>, whitespace-normalized.

    <script> and <style> elements are removed from the tree (in place)
    so that only rendered, human-visible text remains.

    Returns an empty string when the document has no <body> element
    (the original crashed with a TypeError in that case).
    """
    body = soup.find('body')
    if body is None:
        return ''
    # Plain loop instead of a list comprehension built only for its
    # side effects: decompose() mutates the tree and returns nothing.
    for tag in body(['script', 'style']):
        tag.decompose()
    return ' '.join(body.stripped_strings)
def keywords(html, min_count=5, max_length=10, beta=0.85, max_iter=10, top_n=10):
    """Extract the top Korean noun keywords from an HTML document.

    Args:
        html: raw HTML string to analyze.
        min_count: minimum word frequency for KRWordRank candidates.
        max_length: maximum candidate word length for KRWordRank.
        beta: PageRank damping factor passed to KRWordRank.extract.
        max_iter: number of WordRank iterations.
        top_n: how many top-ranked words to keep before noun filtering.

    Returns:
        A list of nouns (via Hannanum) drawn from the top-ranked keywords.
    """
    # Collapse whitespace in the cleaned page text, then split into
    # sentence-like chunks on '.' for the word-rank algorithm.
    texts = ' '.join(clean(getsoup(html)).split()).split('.')
    texts = [normalize(text, english=True, number=True) for text in texts]
    wordrank_extractor = KRWordRank(min_count=min_count, max_length=max_length, verbose=True)
    keyword_scores, rank, graph = wordrank_extractor.extract(texts, beta, max_iter)
    # Keep the top_n highest-scoring words, then reduce them to nouns.
    top_words = sorted(keyword_scores.items(), key=lambda x: x[1], reverse=True)[:top_n]
    result = Hannanum().nouns(' '.join(word for word, _ in top_words))
    print('\x1b[5;30;42mKEYWORDS\x1b[0m : ' + ' '.join(result))
    return result
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment