Skip to content

Instantly share code, notes, and snippets.

@kamathln
Forked from copyninja/kannada_word_extractor.py
Created October 15, 2010 13:24
Show Gist options
  • Save kamathln/628173 to your computer and use it in GitHub Desktop.
Save kamathln/628173 to your computer and use it in GitHub Desktop.
#!/usr/local/bin/python3
import re
from urllib.request import urlopen, Request
from urllib.error import HTTPError
def get_page(urlstring):
    """Fetch a page and return its body decoded as UTF-8 text.

    Arguments:
    - `urlstring`: URL of the page to download.

    Returns the decoded body. When the server answers with an HTTP error,
    the error body is printed and an empty string is returned.
    """
    request = Request(url=urlstring)
    # Set an explicit User-Agent header on the request.
    request.add_header("User-Agent", "Mozilla 5.0")
    try:
        # The context manager closes the response once the body is read.
        with urlopen(request) as response:
            page_content = response.read()
    except HTTPError as e:
        print("Something went wrong {}".format(e.read()))
        # There are no bytes to decode on this path; return text directly
        # instead of calling .decode() on a str, which has no such method.
        return ""
    return page_content.decode("utf-8")
def find_kannada_words(content):
    """Return every run of Kannada-block characters found in `content`.

    Each maximal run of characters in the range U+0C80..U+0CF2 is one
    entry, listed in order of appearance. Duplicates and single-character
    entries are kept; single characters should be removed by inspection,
    and duplicates can be dropped afterwards with:

        $ sort -u Words.txt > Words2.txt
    """
    kannada_run = re.compile("([\u0c80-\u0cf2]+)[^\u0c80-\u0cf2]*")
    # Group 1 is the Kannada run; the rest of the match only skips the
    # non-Kannada text that follows it.
    return [match.group(1) for match in kannada_run.finditer(content)]
if __name__ == "__main__":
    # Alternative source page:
    # content = get_page("http://sampada.net/blog/omshivaprakash/07/10/2010/28378")
    content = get_page("http://kn.wikipedia.org/wiki/Karnataka")
    kannada_words = find_kannada_words(content)
    # Write both files as UTF-8 explicitly: the text contains Kannada
    # characters, which the locale's default encoding may not be able to
    # represent. The context managers close the files even if a write fails.
    with open("content.txt", "w", encoding="utf-8") as page_file:
        page_file.write(content)
    # One extracted word per line.
    with open("Words.txt", "w", encoding="utf-8") as words_file:
        words_file.writelines(word + "\n" for word in kannada_words)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment