Skip to content

Instantly share code, notes, and snippets.

Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save malaikannan/8a8b82fa9f45cb06b384a44eb3ad1a73 to your computer and use it in GitHub Desktop.
Save malaikannan/8a8b82fa9f45cb06b384a44eb3ad1a73 to your computer and use it in GitHub Desktop.
import os
from urllib.parse import urljoin

import requests
import tamil
from bloomfilter import BloomFilter
from bs4 import BeautifulSoup
# URL for Project Madurai
base_url = 'https://www.projectmadurai.org'
bloom_filter_file_path = "tamil_bloom_filter.txt"
home_page = base_url + "/pmworks.html"

# Fetch the index page that lists the works. The request must target
# `home_page`; the previous code passed an undefined name (`url`), which
# raised NameError before anything was downloaded.
res = requests.get(home_page)
# Fail loudly on an HTTP error instead of parsing an error page as the index.
res.raise_for_status()
html_page = res.content

# Beautiful Soup parses the index page so the individual document links can be
# extracted. The parser is named explicitly so the result does not depend on
# which optional parsers happen to be installed.
soup = BeautifulSoup(html_page, 'html.parser')
links = [a['href'] for a in soup.select('a[href]')]

# Keep only links that point at an HTML e-text page: the href must contain
# both ".html" and "pm_etext".
html_links = [link for link in links if ".html" in link and "pm_etext" in link]
# Names of parent elements whose text nodes are not part of the visible
# document text. Built once, outside the loop, because it never changes.
blacklist = {
    '[document]',
    'noscript',
    'header',
    'html',
    'meta',
    'head',
    'input',
    'script',
    # there may be more elements you don't want, such as "style", etc.
}

# Accumulates the Tamil words found on every downloaded page (duplicates kept).
ta_words = []
for link in html_links:
    # Resolve the href against the index page URL. Plain string concatenation
    # with base_url produces a malformed URL when the href is relative without
    # a leading slash, or is already absolute.
    res = requests.get(urljoin(home_page, link))
    html_page = res.content
    soup = BeautifulSoup(html_page, 'html.parser')

    # Every text node in the page, including ones inside non-content tags.
    text = soup.find_all(text=True)

    # Join the kept text nodes, each followed by a single space. Building the
    # string in one join avoids repeated re-allocation from `+=` in a loop.
    output = ''.join(
        '{} '.format(t) for t in text if t.parent.name not in blacklist
    )

    # Split the page text into letters, then group them into Tamil words,
    # using the `tamil.utf8` helpers.
    taletters = tamil.utf8.get_letters(output)
    ta_words_page = tamil.utf8.get_tamil_words(taletters)

    # Extend in place rather than rebuilding the whole list for every page.
    ta_words.extend(ta_words_page)
# Drop duplicate words. Going through a set means the resulting order is
# arbitrary.
ta_words_unique = list(set(ta_words))

# Write one word per line. The encoding is set explicitly because the words
# are Tamil text and the platform default encoding may not be able to
# represent them. The `with` block closes the file even if a write fails.
with open('tamilwordlist.txt', 'w', encoding='utf-8') as outfile:
    outfile.writelines(str(item) + '\n' for item in ta_words_unique)
def create_bloomfilter_file(ta_words_list):
    """Build a Bloom filter from ``ta_words_list`` and write it to a file.

    A ``BloomFilter`` is constructed from the number of words in the list and
    a false-positive probability of 0.001. Every word is then added, and the
    filter is written with ``writetofile`` to the module-level
    ``bloom_filter_file_path``.

    Args:
        ta_words_list: Sized iterable of words to add to the filter.
    """
    target_fp_rate = 0.001
    word_filter = BloomFilter(len(ta_words_list), target_fp_rate)
    for entry in ta_words_list:
        word_filter.add(entry)
    # The destination path is read from module scope, not passed in.
    word_filter.writetofile(bloom_filter_file_path)
# Path that create_bloomfilter_file passes to writetofile. This re-assigns the
# same value the script already gave this name near the top.
bloom_filter_file_path = "tamil_bloom_filter.txt"
# Build the Bloom filter from the de-duplicated word list and write it out.
create_bloomfilter_file(ta_words_unique)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment