@junhoyeo
Created July 22, 2018 10:39
Extract Korean keywords from the web with Python 3
import requests
from bs4 import BeautifulSoup
from krwordrank.hangle import normalize
from krwordrank.word import KRWordRank
from konlpy.tag import Hannanum


def parse(url):
    # Fetch a single page and extract its keywords.
    print('\x1b[0;30;47mURL\x1b[0m : ' + url)
    return keywords(requests.get(url).text)


def parse_all(articles):
    # Extract keywords for every URL, printing a separator line between pages.
    results = []
    print('\x1b[5;30;44m' + '=' * 30 + '\x1b[0m')
    for idx, url in enumerate(articles):
        if idx != 0:
            print('\x1b[5;30;44m' + '=' * 30 + '\x1b[0m')
        results.append(parse(url))
    print('\x1b[5;30;44m' + '=' * 30 + '\x1b[0m')
    return results


def getsoup(html):
    # Parse the raw HTML and print the page title.
    soup = BeautifulSoup(html, 'html.parser')
    print('\x1b[0;30;47mTITLE\x1b[0m : ' + soup.title.text)
    return soup


def clean(soup):
    # Keep only the visible text inside <body>, dropping <script> and <style> tags.
    body = soup.find('body')
    for tag in body(['script', 'style']):
        tag.decompose()
    return ' '.join(body.stripped_strings)


def keywords(html):
    # Split the cleaned page text into sentence-like chunks and normalize them.
    texts = ' '.join(clean(getsoup(html)).split()).split('.')
    texts = [normalize(text, english=True, number=True) for text in texts]

    # Rank candidate words with KR-WordRank (unsupervised, graph-based extraction).
    wordrank_extractor = KRWordRank(min_count=5, max_length=10, verbose=True)
    beta = 0.85    # PageRank-style damping factor
    max_iter = 10  # number of ranking iterations
    ranked, rank, graph = wordrank_extractor.extract(texts, beta, max_iter)

    # Take the top 10 ranked words, then keep only the nouns via the Hannanum tagger.
    result = ' '.join(word for word, _ in sorted(ranked.items(), key=lambda x: x[1], reverse=True)[:10])
    result = Hannanum().nouns(result)
    print('\x1b[5;30;42mKEYWORDS\x1b[0m : ' + ' '.join(result))
    return result
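
A minimal usage sketch, assuming the functions above run in the same script; the URLs are placeholders, not part of the original gist. parse_all prints each page's title and top keywords, and returns one noun list per URL.

if __name__ == '__main__':
    # Hypothetical article URLs for illustration only; replace with real pages
    # that contain enough Korean text to satisfy KR-WordRank's min_count=5.
    articles = [
        'https://example.com/news/article-1',
        'https://example.com/news/article-2',
    ]
    for url, nouns in zip(articles, parse_all(articles)):
        print(url, '->', ', '.join(nouns))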