Skip to content

Instantly share code, notes, and snippets.

@junaidpv
Created January 8, 2011 09:38
Show Gist options
  • Save junaidpv/770718 to your computer and use it in GitHub Desktop.
Save junaidpv/770718 to your computer and use it in GitHub Desktop.
#!/usr/bin/python
# -*- coding: utf-8 -*-
from urllib.request import *
from html.parser import *
import re
class MyParser(HTMLParser):
    """HTML parser that records the ``href`` value of every ``<a>`` start tag.

    Feed markup through ``feed()``; each ``href`` attribute found on an
    ``a`` tag is appended, in the order encountered, to ``hyperlinks``.

    Attributes:
        hyperlinks: list of ``href`` values collected by this parser
            instance so far.
    """

    def __init__(self, *args, **kwargs):
        """Create the parser with its own, initially empty, link list.

        All arguments are passed through unchanged to ``HTMLParser``.
        """
        super().__init__(*args, **kwargs)
        # Created per instance: a list defined on the class body would be
        # one shared object, so links from every parser would pile up in it.
        self.hyperlinks = []

    def handle_starttag(self, tag, attributes):
        """Dispatch ``a`` start tags to ``start_a``; ignore all other tags."""
        if tag == 'a':
            self.start_a(attributes)

    def start_a(self, attributes):
        """Append the value of each ``href`` attribute to ``hyperlinks``.

        Args:
            attributes: iterable of ``(name, value)`` pairs for one tag.
        """
        for name, value in attributes:
            if name == 'href':
                self.hyperlinks.append(value)
#web_page = urlopen('http://junaidpv.in')
#print(web_page.info().get('Content-Type'))
#my_parser = MyParser()
#my_parser.feed(str(web_page.read(), encoding='utf-8'))
#web_page.close()
#for link in my_parser.hyperlinks:
# print(link)
def f5(seq, idfun=None):
    """Return the items of *seq* with duplicates removed, order preserved.

    Two items count as duplicates when ``idfun`` maps them to the same
    marker; only the first item seen for each marker is kept.

    Args:
        seq: iterable of items to de-duplicate.
        idfun: optional one-argument callable producing the marker used to
            compare items. Defaults to the identity function, i.e. the
            items themselves are compared. Markers must be hashable.

    Returns:
        A new list holding the first occurrence of each distinct marker's
        item, in the order they appeared in *seq*.
    """
    if idfun is None:
        def idfun(x): return x
    # A set is enough here: only membership of the marker matters.
    seen = set()
    result = []
    for item in seq:
        marker = idfun(item)
        if marker in seen:
            continue
        seen.add(marker)
        result.append(item)
    return result
# Extract every run of Malayalam characters from input.txt and write the
# distinct runs, one per line and in first-seen order, to output.txt.

# Read the whole input as UTF-8 text; the context manager closes the file
# even if reading or decoding raises.
with open('input.txt', mode='r', encoding='utf-8') as i_file:
    text = i_file.read()

# A "word" is one or more consecutive characters from U+0D00-U+0D7F (the
# Malayalam Unicode block) or the joiners U+200C (ZWNJ) / U+200D (ZWJ).
pattern = re.compile('[\u0D00-\u0D7F\u200C\u200D]+')
words = pattern.findall(text)
unique_words = f5(words)
print(len(words), " words.")
print(len(unique_words), " unique words.")

# The output file is opened only after the input has been processed, so a
# failure above does not truncate an existing output.txt. Writes are left
# to the file's own buffering instead of flushing after every word.
with open('output.txt', mode='w', encoding='utf-8') as o_file:
    for line in unique_words:
        o_file.write(line + "\n")
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment