Skip to content

Instantly share code, notes, and snippets.

@clepz
Last active July 29, 2023 15:06
Show Gist options
  • Star 1 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save clepz/70c6758bf58678ff14c756a5a29f4063 to your computer and use it in GitHub Desktop.
Save clepz/70c6758bf58678ff14c756a5a29f4063 to your computer and use it in GitHub Desktop.
# -*- coding: UTF-8 -*-
import sqlite3
from sqlite3 import Error
import urllib
import HTMLParser
import html2text
import re
from os import listdir
from os.path import isfile, join
from collections import Counter
# Words taken from the other-language text. MyHTMLParser.handle_endtag() extends this
# list; the main script counts it with Counter, stores it via insertIndex() and then
# rebinds it to a fresh list.
indexList = []
# Words taken from the Turkish text. handle_endtag() only extends it while
# lang_code == 'EN'; the main script stores its counts under the "TR" language code.
indexListTr = []
# Language code of the books currently being processed. The main script's loop rebinds
# it, and handle_endtag() reads it through a `global` declaration.
lang_code = ''
# Set to True by MyHTMLParser.handle_starttag() on a <tr> whose first attribute value
# contains "IgnorePrg", and back to False on one containing "row-Paragraph".
inIgnorePrg = False
def create_connection(db_file):
    """Open the SQLite database at ``db_file``.

    Returns the ``sqlite3`` connection on success. If ``sqlite3`` raises an
    ``Error`` the error is printed and ``None`` is returned instead.
    """
    try:
        return sqlite3.connect(db_file)
    except Error as e:
        # Best effort: report the problem and let the caller see None.
        print(e)
    return None
def insertP(conn, values):
    """Insert one row into ``paragraphs`` and return its row id.

    ``values`` supplies, in order, paragraph_name, section_id,
    paragraph_text_tr and paragraph_text_other. The statement runs inside
    ``with conn`` so it is committed on success and rolled back on error.
    """
    statement = ''' INSERT INTO paragraphs (paragraph_name,section_id,paragraph_text_tr,paragraph_text_other) VALUES(?,?,?,?) '''
    with conn:
        cursor = conn.execute(statement, values)
    return cursor.lastrowid
def insertBook(conn, values):
    """Insert one row into ``books`` and return its row id.

    ``values`` supplies, in order, book_name and lang_code. The statement
    runs inside ``with conn`` so it is committed on success and rolled back
    on error.
    """
    statement = ''' INSERT INTO books (book_name,lang_code) VALUES(?,?) '''
    with conn:
        cursor = conn.execute(statement, values)
    return cursor.lastrowid
def insertSection(conn, values):
    """Insert one row into ``booksSections`` and return its row id.

    ``values`` supplies, in order, book_id and section_name. The statement
    runs inside ``with conn`` so it is committed on success and rolled back
    on error.
    """
    statement = ''' INSERT INTO booksSections (book_id,section_name) VALUES(?,?) '''
    with conn:
        cursor = conn.execute(statement, values)
    return cursor.lastrowid
def insertIndex(conn, values, lang_code):
    """Store word counts in the ``indexList`` table.

    ``values`` maps each word to its count. Words are written in sorted
    order, each tagged with ``lang_code``; the empty string is skipped.
    Everything runs inside one ``with conn`` transaction. Returns ``None``.
    """
    statement = ''' INSERT INTO indexList (lang_code,word, count) VALUES(?,?,?)'''
    rows = [
        (lang_code, word, values[word])
        for word in sorted(values)
        if word != ''
    ]
    with conn:
        conn.cursor().executemany(statement, rows)
def tireIsaretiDuzelt(value):
    """Remove one leading and one trailing hyphen from each word, in place.

    ``value`` is a list of strings and is modified directly; nothing is
    returned. Empty strings are left untouched, and a word consisting only
    of a hyphen becomes the empty string.

    The previous slices (``[1:-1]`` and ``[0:-2]``) also cut off the last
    real character of the word: ``'-abc'`` became ``'ab'`` and ``'abc-'``
    became ``'ab'``. Only the hyphen itself is removed now.
    """
    for position, word in enumerate(value):
        if word.startswith('-'):
            word = word[1:]
        # startswith/endswith are safe on '', so no extra emptiness check is
        # needed after the first strip.
        if word.endswith('-'):
            word = word[:-1]
        value[position] = word
# HTML parser that accumulates the Turkish ("col-TR") and other-language ("col-OTHER")
# text of a page and stores each collected pair through insertP().
# NOTE(review): the indentation of this class is missing in this copy of the file, so the
# nesting of the statements below cannot be confirmed from here. The comments describe
# what individual statements do, not which branch they belong to.
class MyHTMLParser(HTMLParser.HTMLParser, object):
# conn: sqlite3 connection handed to insertP() whenever a paragraph is stored.
def __init__(self, conn):
super(MyHTMLParser, self).__init__()
# yazdir / yazdirOther: while set, handle_data() appends incoming text to
# cumle / cumleOther respectively.
# devam / devamOther: set when a tag whose first attribute value is "col-TR" /
# "col-OTHER" is seen; they gate the <p> handling in handle_starttag().
self.yazdir = False
self.devam = False
self.yazdirOther = False
self.devamOther = False
self.conn = conn
# prgSayi is assigned here but not read anywhere else in this class.
self.prgSayi = 1
# Text accumulated for the Turkish side.
self.cumle = ""
# baslik is set when an <a> tag carrying a "name" attribute is seen
# (presumably "baslik" = heading; confirm against the input HTML).
self.baslik = False
self.baslikOther = False
# Value of the most recent "name" attribute; stored as paragraph_name by insertP().
self.attr = ""
# Text accumulated for the other-language side.
self.cumleOther = ""
# Section row id; the main script overwrites it with the result of insertSection().
self.section_id = -1
# Updates the parser flags from the opening tag and its attributes.
# NOTE(review): `len(attrs) is not 0` compares identity with an int literal; `!= 0`
# (or plain truthiness) is the reliable spelling. HTMLParser also reports attributes
# without a value as None, which the `in attrs[0][1]` and `attr[1].lower()` uses below
# would not accept - confirm the input never contains such attributes.
def handle_starttag(self, tag, attrs):
# print("Found a start tag:", tag)
global inIgnorePrg
# A <tr> whose first attribute value contains "IgnorePrg" switches the module-level
# flag on; one containing "row-Paragraph" switches it off again.
if (len(attrs) is not 0):
if (tag == "tr" and "IgnorePrg" in attrs[0][1] ):
inIgnorePrg = True
elif (tag == "tr" and "row-Paragraph" in attrs[0][1]):
inIgnorePrg = False
# First attribute value "col-OTHER" marks the start of the other-language side.
if (len(attrs) is not 0):
if (attrs[0][1] == "col-OTHER"):
self.devamOther = True
if (self.devamOther):
if (tag == "p"):
# Inside an ignored row only paragraphs with an attribute value containing
# "başlık" are kept; otherwise the other-language flags are cleared and the
# method returns early.
if inIgnorePrg:
baslik = False
for attr in attrs:
if "başlık" in attr[1].lower():
baslik = True
if not baslik:
self.devamOther = False
self.yazdirOther = False
return
self.yazdirOther = True
# First attribute value "col-TR" marks the start of the Turkish side.
if (len(attrs) is not 0):
if (attrs[0][1] == "col-TR"):
self.devam = True
if (self.devam):
if (tag == "p"):
# Same "başlık" filter as above, applied to the Turkish side.
if inIgnorePrg:
baslik = False
for attr in attrs:
if "başlık" in attr[1].lower():
baslik = True
if not baslik:
self.devam = False
self.yazdir = False
self.yazdirOther = False
return
self.yazdir = True
self.yazdirOther = False
self.devamOther = False
# Remember the tag's "name" attribute if no name has been recorded yet.
if self.attr == '':
for attr in attrs:
if attr[0] == "name":
self.attr = attr[1]
# An <a> tag with a "name" attribute overrides the stored name and sets baslik.
if (tag == "a"):
for attr in attrs: # [ (class,value), (name,value) ]
if (attr[0] == "name"):
self.attr = attr[1]
self.baslik = True
# print self.attr
# Used to add the texts from the other language.
# On a closing </td> with non-empty Turkish text: normalises both texts, and when the
# Turkish text ends like a finished sentence (or baslik is set) indexes the words and
# stores the pair with insertP(), then resets the accumulated state.
def handle_endtag(self, tag):
if (tag == 'td' and self.cumle != ''):
# Stripped copy of the raw Turkish text, used only for the end-of-sentence test below.
cumleKontrol = self.cumle.strip()
# Normalise the Turkish text: newlines and tabs to spaces, runs of spaces collapsed,
# quote-like characters removed, "I" mapped to dotless "ı" before lower-casing, and
# circumflexed vowels replaced by their plain forms.
self.cumle = self.cumle.replace("\n", " ")
self.cumle = re.sub(u' +', ' ', self.cumle)
self.cumle = re.sub(u'\\t+', ' ', self.cumle)
self.cumle = re.sub(u'[‘’‚„\'`´’¿¡]', '', self.cumle)
self.cumle = self.cumle.replace(u"I", u"ı")
self.cumle = self.cumle.lower()
self.cumle = self.cumle.replace(u'â', u'a').replace(u'û', u'u').replace(u'î', u'i')
# Same whitespace/quote normalisation and lower-casing for the other-language text.
self.cumleOther = self.cumleOther.replace("\n", " ")
self.cumleOther = re.sub(u' +', ' ', self.cumleOther)
self.cumleOther = re.sub(u'\\t+', ' ', self.cumleOther)
self.cumleOther = re.sub(u'[‘’‚„\'`´¿¡]', '', self.cumleOther)
self.cumleOther = self.cumleOther.lower()
# True when one of the listed punctuation marks appears within the last few characters
# of the stripped Turkish text, or when baslik is set.
# NOTE(review): cumleKontrol[-1] raises IndexError if the text was whitespace only.
if (u'\u2026' in cumleKontrol[-2:] or '*' in cumleKontrol[-2:] or ')' in cumleKontrol[-2:] or '.' in cumleKontrol[
-1] or '.' in cumleKontrol[-3:] or '!' in cumleKontrol[-3:] or ':' in cumleKontrol[-3:] or '?' in cumleKontrol[
-3:] or self.baslik):
global indexList
global indexListTr
global lang_code
# The Turkish word index is collected only while the English books are processed.
if (lang_code == 'EN'):
indexListStringTr = nonLetterKarakterleriKaldir(self.cumle).strip()
indexListStringTr = arapcalariKaldir(indexListStringTr).strip()
indexListStringTr = indexListStringTr.split(' ')
tireIsaretiDuzelt(indexListStringTr)
indexListTr.extend(indexListStringTr)
indexStringOther = nonLetterKarakterleriKaldir(self.cumleOther).strip()
# For "AR"/"FA" the marks handled by harekeleriKaldir() are removed from both the
# index words and the stored text; for other languages arapcalariKaldir() is applied
# to the index words.
if (lang_code == "AR" or lang_code == "FA"):
indexStringOther = harekeleriKaldir(indexStringOther).strip()
self.cumleOther = harekeleriKaldir(self.cumleOther).strip()
else:
indexStringOther = arapcalariKaldir(indexStringOther).strip()
indexStringOther = indexStringOther.split(' ')
tireIsaretiDuzelt(indexStringOther)
indexList.extend(indexStringOther)
# Both texts are stored trimmed, with a single leading space.
self.cumle = self.cumle.strip()
self.cumleOther = self.cumleOther.strip()
self.cumle = " " + self.cumle
self.cumleOther = " " + self.cumleOther
self.yazdir = False
self.devam = False
insertP(self.conn, (self.attr.decode(encoding="utf-8"), self.section_id, self.cumle, self.cumleOther))
self.yazdirOther = False
self.devamOther = False
# print self.cumleOther
self.cumleOther = ''
# print self.cumle
# print '--------------------------------'
self.cumle = ''
self.attr = ''
if self.baslik:
self.baslik = False
# NOTE(review): with the indentation lost it is not visible which `if` this `else`
# pairs with; it clears the four collection flags.
else:
self.devam = False
self.yazdir = False
self.devamOther = False
self.yazdirOther = False
# Decodes the incoming text as UTF-8 (Python 2 `unicode`) and appends it to whichever
# accumulators are currently enabled.
def handle_data(self, data):
data = unicode(data, 'utf-8')
if (self.yazdir):
self.cumle += data
if (self.yazdirOther):
self.cumleOther += data
# Called by the main script after a section file has been fed: stores any text still
# pending in cumle/cumleOther via insertP() (without the normalisation done in
# handle_endtag) and resets the parser state.
def sectionSonuEkleme(self):
if self.cumle != '':
self.yazdir = False
self.devam = False
self.yazdirOther = False
self.devamOther = False
# print self.cumleOther
insertP(self.conn, (self.attr.decode(encoding="utf-8"), self.section_id, self.cumle, self.cumleOther))
self.cumleOther = ''
# print self.cumle
# print '--------------------------------'
self.cumle = ''
self.attr = ''
if self.baslik:
self.baslik = False
if self.baslikOther:
self.baslikOther = False
def nonLetterKarakterleriKaldir(text):
    """Return ``text`` with punctuation, symbols and ASCII digits removed.

    ``text`` is expected to be unicode text; the cleaned text is returned.

    Both patterns are unicode literals. The first one used to be a plain
    (byte) string containing non-ASCII characters; under Python 2 such a
    pattern is compared byte-by-byte against unicode text, so the UTF-8
    bytes of characters like ``◌`` also removed unrelated letters such as
    ``â``. As a unicode pattern each listed character matches only itself.
    """
    # Inside a character class only '[' and ']' still need a backslash.
    text = re.sub(u'[,.)(«!:?/۞»;#◌*}{\\[\\]\'"]|[0-9]', u'', text)
    # NOTE(review): the repeated dotted circles below may have carried
    # combining marks that were lost in this copy - confirm against the
    # original file.
    text = re.sub(u"[؛،”“…'ِ,﴾﴿ﷺ◌◌◌—]", u"", text, flags=re.UNICODE)
    return text
def arapcalariKaldir(text):
    """Return ``text`` with Arabic-script characters removed.

    Every character whose code point falls in one of the ranges listed in
    the pattern below is deleted; all other characters are kept as they are.
    """
    arabic_chars = re.compile(
        u"[\u0600-\u06ff]|[\u0750-\u077f]|[\ufb50-\ufbc1]|[\ufbd3-\ufd3f]|[\ufd50-\ufd8f]|[\ufd92-\ufdc7]|[\ufe70-\ufefc]|[\uFDF0-\uFDFD]",
        flags=re.UNICODE)
    return arabic_chars.sub('', text)
def harekeleriKaldir(text):
    """Return ``text`` without Arabic vowel marks (harakat) and tatweel.

    ``text`` is expected to be unicode text and the result is unicode text.

    The marks are matched directly on the text by code point. The previous
    version encoded the text to UTF-8, applied a byte pattern built from
    invisible combining characters typed into the source, and decoded
    again; that round trip is unnecessary and raises ``TypeError`` on
    Python 3, where a text pattern cannot be applied to bytes.
    """
    noise = re.compile(
        u"["
        u"\u0651"  # Tashdid
        u"\u064e"  # Fatha
        u"\u064b"  # Tanwin Fath
        u"\u064f"  # Damma
        u"\u064c"  # Tanwin Damm
        u"\u0650"  # Kasra
        u"\u064d"  # Tanwin Kasr
        u"\u0652"  # Sukun
        u"\u0640"  # Tatwil/Kashida
        u"]",
        re.UNICODE)
    return noise.sub(u'', text)
# Script entry point: walks the book folders of every language, stores books, sections
# and paragraphs in the SQLite database, then stores the per-language word index.
# NOTE(review): the indentation of this block is missing in this copy of the file, so the
# loop nesting cannot be confirmed from here. The print statements are Python 2 syntax.
if __name__ == '__main__':
conn = create_connection("/home/clepz/Documents/RisaleKarsilastirmali/Risaleler.db")
parser = MyHTMLParser(conn)
mypath = "/home/clepz/Documents/RisaleKarsilastirmali/AppKarsilastirmaliKitaplar9Subat2020/books"
# Maps a folder-name prefix to the language code stored for that language's books.
langs = {'ing': 'EN', 'arabi': 'AR', 'rusca': 'RU', 'almanca': 'DE', 'ozbek': 'UZ',
'jap': 'JA', 'isp': 'ES', 'farsi': 'FA', 'endonezce': "ID", 'cince': "CN",
'fransizca': 'FR'}
folders = [folder for folder in listdir(mypath)] # if "ing" in folder ]
# test = {'ing':'en'}
# The loop variable rebinds the module-level lang_code that
# MyHTMLParser.handle_endtag() reads through its `global` declaration.
for name, lang_code in langs.items():
folderNames = [i for i in folders if i.startswith(name)]
for f in folderNames:
print f
# add the book
book_id = insertBook(conn, (f, lang_code))
sectionsPath = mypath + "/" + f
print sectionsPath + " ----- " + str(book_id)
# get the sections inside the book
sections = listdir(sectionsPath)
sections.sort()
for section in sections: # read the contents of each section and add them to the database
# Only files whose name contains "section-<digits>" are treated as sections.
sectionName = re.findall("section-[0-9]+", section)
if sectionName.__len__() != 0:
parser.section_id = insertSection(conn, (book_id, sectionName[0]))
print sectionsPath + "/" + section
page = urllib.urlopen(sectionsPath + "/" + section).read()
parser.feed(page)
# Store whatever text the parser still holds after the file has been fed.
parser.sectionSonuEkleme()
# when the sections are finished, continue with the next file
# when the books of that language are finished.
# The word index is stored for every language except 'JA'.
if (lang_code != 'JA'):
values = Counter(indexList)
insertIndex(conn, values, lang_code)
# The Turkish index gathered while parsing the English books is stored under "TR".
if (lang_code == 'EN'):
values = Counter(indexListTr)
insertIndex(conn, values, "TR")
print 'tr dili indexi eklendi.'
indexListTr = None
print lang_code + ' dili indexi eklendi'
indexList = []
# html = open("/home/clepz/Documents/RisaleKarsilastirmali/arabi23soz/arabi23soz-section-0-normal.html").read()
# paragraflar = html2text.html2text(html.decode("utf-8")).split('|')
# count = 0
"""
for prg in paragraflar:
if count % 2 == 0:
print prg
count = count + 1
"""
"""
listem = []
for string in page.split("<td class=\"col-TR\">"):
listem.append(string.split("</td>"))
print (listem[0])
"""
# print (type(page))
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment