
@akngs
Last active November 26, 2017 08:56
Extract frequent words from URL

How to run

  1. Install Python 3
  2. Install KoNLPy and lxml (e.g. pip install konlpy lxml)
  3. Run python wordfreq.py --xpath './/div[@class="statement"]/div[@class="content glossary"]/text()' --url http://pokr.kr/meeting/1933823653/dialog (see the note after this list for saving the output)
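
The script writes CSV with word and freq columns to standard output, so the result can be redirected to a file (words.csv below is just an example name):

python wordfreq.py --xpath './/div[@class="statement"]/div[@class="content glossary"]/text()' --url http://pokr.kr/meeting/1933823653/dialog > words.csv

The full script, wordfreq.py: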
import argparse
import csv
import sys
from collections import Counter
from urllib.request import urlopen

from konlpy.tag import Twitter
from lxml import html


def main():
    parser = argparse.ArgumentParser(
        description='Extract frequent words from URL')
    parser.add_argument('--url', help='URL to fetch')
    parser.add_argument('--xpath', help='XPath expression')
    args = parser.parse_args()

    text = fetch(args.url, args.xpath)
    words = extract_words(text)
    to_csv(sys.stdout, words)


def fetch(url, xpath):
    """Download the page and return the text of the nodes matched by xpath."""
    res = urlopen(url).read().decode('utf-8')
    page = html.fromstring(res)
    return '\n'.join(page.xpath(xpath))


def extract_words(text, max_n=500, min_freq=2):
    """Return the most frequent nouns in text as a list of dicts."""
    # Twitter analyzer is the fastest so far
    analyzer = Twitter()
    # Keep nouns longer than two characters to drop short, noisy tokens
    nouns = [n for n in analyzer.nouns(text) if len(n) > 2]
    count = Counter(nouns)
    return [
        {'word': n, 'freq': freq}
        for n, freq in count.most_common(max_n)
        if freq >= min_freq
    ]


def to_csv(stream, objs):
    """Write the word/frequency dicts to stream as CSV."""
    w = csv.DictWriter(stream, objs[0].keys())
    w.writeheader()
    w.writerows(objs)


if __name__ == '__main__':
    main()
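
Since fetch, extract_words, and to_csv are plain functions, the module can also be imported and applied to text from another source. A minimal sketch, assuming the script above is saved as wordfreq.py and that transcript.txt is a UTF-8 text file (both file names are hypothetical):

import sys
from wordfreq import extract_words, to_csv

# Read arbitrary Korean text instead of scraping a page
with open('transcript.txt', encoding='utf-8') as f:
    text = f.read()

# extract_words returns [{'word': ..., 'freq': ...}, ...] ordered by frequency
to_csv(sys.stdout, extract_words(text, max_n=100))

Note that to_csv reads objs[0] for the CSV header, so it expects at least one word to pass the min_freq filter.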