
@akngs
Last active November 26, 2017 08:56
Extract frequent words from URL

How to run

  1. Install Python 3
  2. Install KoNLPy and lxml (e.g. pip install konlpy lxml)
  3. Run python wordfreq.py --xpath './/div[@class="statement"]/div[@class="content glossary"]/text()' --url http://pokr.kr/meeting/1933823653/dialog (see the note after this list for saving the output)
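
The script writes CSV with word and freq columns to standard output, so the result can be redirected to a file (words.csv below is just an example name):

python wordfreq.py --xpath './/div[@class="statement"]/div[@class="content glossary"]/text()' --url http://pokr.kr/meeting/1933823653/dialog > words.csv

The full script, wordfreq.py: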
import argparse
import csv
import sys
from collections import Counter
from urllib.request import urlopen

from konlpy.tag import Twitter
from lxml import html


def main():
    parser = argparse.ArgumentParser(
        description='Extract frequent words from URL')
    parser.add_argument('--url', help='URL to fetch')
    parser.add_argument('--xpath', help='XPath expression')
    args = parser.parse_args()

    text = fetch(args.url, args.xpath)
    words = extract_words(text)
    to_csv(sys.stdout, words)


def fetch(url, xpath):
    """Download the page and return the text of the nodes matched by xpath."""
    res = urlopen(url).read().decode('utf-8')
    page = html.fromstring(res)
    return '\n'.join(page.xpath(xpath))


def extract_words(text, max_n=500, min_freq=2):
    """Return the most frequent nouns in text as a list of dicts."""
    # Twitter analyzer is the fastest so far
    analyzer = Twitter()
    # Keep nouns longer than two characters to drop short, noisy tokens
    nouns = [n for n in analyzer.nouns(text) if len(n) > 2]
    count = Counter(nouns)
    return [
        {'word': n, 'freq': freq}
        for n, freq in count.most_common(max_n)
        if freq >= min_freq
    ]


def to_csv(stream, objs):
    """Write the word/frequency dicts to stream as CSV."""
    w = csv.DictWriter(stream, objs[0].keys())
    w.writeheader()
    w.writerows(objs)


if __name__ == '__main__':
    main()
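
Since fetch, extract_words, and to_csv are plain functions, the module can also be imported and applied to text from another source. A minimal sketch, assuming the script above is saved as wordfreq.py and that transcript.txt is a UTF-8 text file (both file names are hypothetical):

import sys
from wordfreq import extract_words, to_csv

# Read arbitrary Korean text instead of scraping a page
with open('transcript.txt', encoding='utf-8') as f:
    text = f.read()

# extract_words returns [{'word': ..., 'freq': ...}, ...] ordered by frequency
to_csv(sys.stdout, extract_words(text, max_n=100))

Note that to_csv reads objs[0] for the CSV header, so it expects at least one word to pass the min_freq filter.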