Skip to content

Instantly share code, notes, and snippets.

@copyninja
Created October 15, 2010 11:14
Show Gist options
  • Save copyninja/628023 to your computer and use it in GitHub Desktop.
Save copyninja/628023 to your computer and use it in GitHub Desktop.
Extracts Kannada-language words from web pages
#!/usr/local/bin/python3
import re
from urllib.request import urlopen, Request
from urllib.error import HTTPError
from multiprocessing import Process
def get_page(urlstring):
    """Fetch a page and return its body decoded as UTF-8 text.

    Arguments:
    - `urlstring`: URL of the page to download.

    Returns the decoded body, or an empty string when the server answers
    with an HTTP error status (the error body is printed in that case).
    Any other failure, such as ``URLError`` or ``UnicodeDecodeError``,
    propagates to the caller unchanged.
    """
    request = Request(url=urlstring)
    # Send a browser-like User-Agent header with the request.
    request.add_header("User-Agent", "Mozilla 5.0")
    try:
        # The context manager closes the response even if read() raises.
        with urlopen(request) as response:
            page_content = response.read()
    except HTTPError as e:
        print(("Something went wrong {}".format(e.read())))
        # Return text directly: the old fallback value was a str, which has
        # no .decode() and therefore crashed with AttributeError here.
        return ""
    return page_content.decode("utf-8")
def find_kannada_words(content):
    """Append every Kannada word found in `content` to ``Words.txt``.

    Arguments:
    - `content`: text to scan.

    A "word" is a maximal run of characters in the range U+0C80..U+0CF2,
    the range this script uses for Kannada. Each word is written on its own
    line, in order of appearance; duplicates are kept. The file is created
    in the current working directory if it does not exist.
    """
    # findall() already skips over non-matching text between words, so the
    # pattern only needs to describe the word itself.
    expression = re.compile("[\u0c80-\u0cf2]+")
    kannada_words = expression.findall(content)
    # Explicit UTF-8: the locale default encoding may be unable to represent
    # Kannada characters and would raise UnicodeEncodeError.
    with open("Words.txt", "a", encoding="utf-8") as outfile:
        outfile.writelines(word + "\n" for word in kannada_words)
def find_urls(content):
    """Append every absolute ``http://`` link target in `content` to ``URL.txt``.

    Arguments:
    - `content`: HTML text to scan.

    Only anchors written exactly as ``<a href="http://...">`` are matched,
    where ``href`` is the first attribute and is double-quoted. Each URL is
    written on its own line, in order of appearance; duplicates are kept.
    The file is created in the current working directory if it does not
    exist.
    """
    # Raw string so the regex escapes are not treated as (invalid) Python
    # string escapes. The capture stops at the closing quote, so trailing
    # attributes or neighbouring anchors on the same line are never included.
    # NOTE(review): https:// links are not matched, as in the original
    # pattern; confirm whether that is still wanted.
    urls = re.compile(r'<a href="(http://[^"\s>]+)"')
    found = urls.findall(content)
    # Explicit UTF-8 so URLs containing non-ASCII characters can be written
    # regardless of the locale default encoding.
    with open("URL.txt", "a", encoding="utf-8") as urlfile:
        urlfile.writelines(url + "\n" for url in found)
if __name__ == "__main__":
    # Download the page once; both extractors work on the same text.
    content = get_page("http://kn.wikipedia.org/wiki/Karnataka")

    # Keep a copy of the fetched page. Explicit UTF-8 because the page holds
    # Kannada text that the locale default encoding may not represent; the
    # context manager closes the file even if the write fails.
    with open("content.txt", "w", encoding="utf-8") as fp:
        fp.write(content)

    # Run the word and URL extraction in two separate processes; each one
    # appends to its own output file.
    workers = [
        Process(target=find_kannada_words, args=(content,)),
        Process(target=find_urls, args=(content,)),
    ]
    for worker in workers:
        worker.start()
    for worker in workers:
        worker.join()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment