# clepz/htmlToDbArapca.py

## htmlToDbArapca.py
# -*- coding: UTF-8 -*-
import sqlite3
from sqlite3 import Error
import urllib
import HTMLParser
import html2text
import re
from os import listdir
from os.path import isfile, join
from collections import Counter

# Words collected from the other-language column for the language currently
# being imported; extended by MyHTMLParser.handle_endtag and rebound to a fresh
# list by the main script after each language.
indexList = []
# Words collected from the Turkish column; only extended while lang_code == 'EN'.
indexListTr = []
# Code of the language currently being imported; rebound by the main loop.
lang_code = ''

# True while the parser is inside a <tr> whose first attribute value contains
# "IgnorePrg"; cleared again by a <tr> whose first attribute value contains
# "row-Paragraph".
inIgnorePrg = False


def create_connection(db_file):
    """Open a connection to the SQLite database stored at ``db_file``.

    Returns the connection object. If sqlite3 raises ``Error`` the error is
    printed and ``None`` is returned instead.
    """
    try:
        return sqlite3.connect(db_file)
    except Error as err:
        print(err)
        return None


def insertP(conn, values):
    """Insert one row into ``paragraphs`` and return its row id.

    ``values`` is the 4-tuple (paragraph_name, section_id, paragraph_text_tr,
    paragraph_text_other) bound to the statement's placeholders.
    """
    statement = ''' INSERT INTO paragraphs (paragraph_name,section_id,paragraph_text_tr,paragraph_text_other) VALUES(?,?,?,?) '''
    # The connection context manager commits on success and rolls back on error.
    with conn:
        cursor = conn.cursor()
        cursor.execute(statement, values)
        new_row_id = cursor.lastrowid
    return new_row_id


def insertBook(conn, values):
    """Insert one row into ``books`` and return its row id.

    ``values`` is the pair (book_name, lang_code).
    """
    statement = ''' INSERT INTO books (book_name,lang_code) VALUES(?,?) '''
    # The connection context manager commits on success and rolls back on error.
    with conn:
        cursor = conn.cursor()
        cursor.execute(statement, values)
        new_row_id = cursor.lastrowid
    return new_row_id


def insertSection(conn, values):
    """Insert one row into ``booksSections`` and return its row id.

    ``values`` is the pair (book_id, section_name).
    """
    statement = ''' INSERT INTO booksSections (book_id,section_name) VALUES(?,?) '''
    # The connection context manager commits on success and rolls back on error.
    with conn:
        cursor = conn.cursor()
        cursor.execute(statement, values)
        new_row_id = cursor.lastrowid
    return new_row_id


def insertIndex(conn, values, lang_code):
    """Store a word-count mapping in ``indexList`` under ``lang_code``.

    ``values`` maps word -> count. Words are inserted in sorted order and the
    empty string is skipped. Nothing is returned.
    """
    statement = ''' INSERT INTO indexList (lang_code,word, count) VALUES(?,?,?)'''
    rows = [(lang_code, word, values[word]) for word in sorted(values) if word != '']
    # All rows go in one transaction: committed on success, rolled back on error.
    with conn:
        cursor = conn.cursor()
        for row in rows:
            cursor.execute(statement, row)


def tireIsaretiDuzelt(value):
    """Strip one leading and one trailing hyphen from every word, in place.

    ``value`` is a list of strings and is modified directly; nothing is
    returned. Empty strings are left untouched, and a word consisting only of
    a hyphen becomes the empty string.

    The previous slices (``[1:-1]`` and ``[0:-2]``) also removed real letters
    from the end of the word; only the hyphen itself is dropped now.
    """
    for i, word in enumerate(value):
        # Slicing with [:1] / [-1:] is safe on empty strings, so no separate
        # emptiness checks are needed.
        if word[:1] == '-':
            word = word[1:]
        if word[-1:] == '-':
            word = word[:-1]
        value[i] = word


class MyHTMLParser(HTMLParser.HTMLParser, object):
    """HTML parser that collects the text of "col-TR" and "col-OTHER" cells and
    stores each completed paragraph through insertP.

    ``section_id`` is written into every inserted row, so the caller sets it
    before feeding a section. ``sectionSonuEkleme`` flushes whatever text is
    still buffered once a section has been fed.
    """

    def __init__(self, conn):
        """Set up the capture state; ``conn`` is the connection passed to insertP."""
        super(MyHTMLParser, self).__init__()
        # yazdir: character data is currently appended to self.cumle.
        self.yazdir = False
        # devam: a "col-TR" element has been seen and its cell is still open.
        self.devam = False
        # Same two flags for the "col-OTHER" column / self.cumleOther.
        self.yazdirOther = False
        self.devamOther = False
        self.conn = conn
        # Not read anywhere else in this class.
        self.prgSayi = 1
        # Text accumulated from the Turkish column.
        self.cumle = ""
        # Set when a named <a> is seen in the Turkish column; forces a flush at </td>.
        self.baslik = False
        # Only ever reset (never set to True) in this class.
        self.baslikOther = False
        # Paragraph name taken from a `name` attribute.
        self.attr = ""
        # Text accumulated from the other-language column.
        self.cumleOther = ""
        # Placeholder until the caller assigns the real section row id.
        self.section_id = -1

    def handle_starttag(self, tag, attrs):
        """Update the capture flags for an opening tag.

        Only the value of the FIRST attribute is inspected to recognise row and
        column markers ("IgnorePrg", "row-Paragraph", "col-OTHER", "col-TR").
        """
        # print("Found a start tag:", tag)
        global inIgnorePrg
        # NOTE(review): `is not 0` compares identity, not value; it relies on
        # CPython's small-int caching. Also, HTMLParser may pass None as the
        # value of a valueless attribute, which would make the `in` tests
        # below raise TypeError -- confirm the input never contains those.
        if (len(attrs) is not 0):
            if (tag == "tr" and "IgnorePrg" in attrs[0][1] ):
                inIgnorePrg = True

            elif (tag == "tr" and "row-Paragraph" in attrs[0][1]):
                inIgnorePrg = False

        # Other-language column: any element whose first attribute value is
        # exactly "col-OTHER" opens it; a following <p> starts the capture.
        if (len(attrs) is not 0):
            if (attrs[0][1] == "col-OTHER"):
                self.devamOther = True
        if (self.devamOther):
            if (tag == "p"):
                if inIgnorePrg:
                    # Inside an ignored row only paragraphs with an attribute
                    # value containing "başlık" are kept.
                    baslik = False
                    for attr in attrs:
                        if "başlık" in attr[1].lower():
                            baslik = True
                    if not baslik:
                        self.devamOther = False
                        self.yazdirOther = False
                        return
                self.yazdirOther = True

        # Turkish column: same scheme, keyed on "col-TR".
        if (len(attrs) is not 0):
            if (attrs[0][1] == "col-TR"):
                self.devam = True
        if (self.devam):
            if (tag == "p"):

                if inIgnorePrg:
                    baslik = False
                    for attr in attrs:
                        if "başlık" in attr[1].lower():
                            baslik = True
                    if not baslik:
                        self.devam = False
                        self.yazdir = False
                        self.yazdirOther = False
                        return
                # Capturing Turkish text switches the other-language capture off.
                self.yazdir = True
                self.yazdirOther = False
                self.devamOther = False
                # Keep the first paragraph name seen since the last flush.
                if self.attr == '':
                    for attr in attrs:
                        if attr[0] == "name":
                            self.attr = attr[1]
            if (tag == "a"):
                for attr in attrs:  # [  (class,value), (name,value)  ]
                    if (attr[0] == "name"):
                        # A named anchor overrides the paragraph name and marks
                        # the buffered text to be flushed at the end of the cell.
                        self.attr = attr[1]
                        self.baslik = True
                        # print self.attr

        # used to add the texts from the other language.

    def handle_endtag(self, tag):
        """On ``</td>`` with buffered Turkish text: normalise both buffers and,
        if the Turkish text looks finished, index and store the paragraph.

        When the text does not look finished only the capture flags are
        cleared; the buffers are kept, so later cells keep appending to them.
        """
        if (tag == 'td' and self.cumle != ''):
            # Raw (un-normalised) Turkish text, used only for the ending test below.
            cumleKontrol = self.cumle.strip()
            self.cumle = self.cumle.replace("\n", " ")
            self.cumle = re.sub(u' +', ' ', self.cumle)
            self.cumle = re.sub(u'\\t+', ' ', self.cumle)
            self.cumle = re.sub(u'[‘’‚„\'`´’¿¡]', '', self.cumle)
            # Map capital I to dotless ı before lower(), which would give "i".
            self.cumle = self.cumle.replace(u"I", u"ı")
            self.cumle = self.cumle.lower()
            # Drop the circumflex from â, û and î.
            self.cumle = self.cumle.replace(u'â', u'a').replace(u'û', u'u').replace(u'î', u'i')

            self.cumleOther = self.cumleOther.replace("\n", " ")
            self.cumleOther = re.sub(u' +', ' ', self.cumleOther)
            self.cumleOther = re.sub(u'\\t+', ' ', self.cumleOther)
            self.cumleOther = re.sub(u'[‘’‚„\'`´¿¡]', '', self.cumleOther)
            self.cumleOther = self.cumleOther.lower()
            # The paragraph counts as complete when one of these marks occurs in
            # the last few characters of the raw Turkish text, or when a named
            # anchor was seen (self.baslik).
            if (u'\u2026' in cumleKontrol[-2:] or '*' in cumleKontrol[-2:] or ')' in cumleKontrol[-2:] or '.' in cumleKontrol[
                -1] or '.' in cumleKontrol[-3:] or '!' in cumleKontrol[-3:] or ':' in cumleKontrol[-3:] or '?' in cumleKontrol[
                                                                                                            -3:] or self.baslik):
                global indexList
                global indexListTr
                global lang_code
                # The Turkish word index is built only during the 'EN' import.
                if (lang_code == 'EN'):
                    indexListStringTr = nonLetterKarakterleriKaldir(self.cumle).strip()
                    indexListStringTr = arapcalariKaldir(indexListStringTr).strip()
                    indexListStringTr = indexListStringTr.split(' ')
                    tireIsaretiDuzelt(indexListStringTr)
                    indexListTr.extend(indexListStringTr)

                indexStringOther = nonLetterKarakterleriKaldir(self.cumleOther).strip()

                if (lang_code == "AR" or lang_code == "FA"):
                    # Arabic/Persian: remove diacritics from both the index
                    # words and the stored text.
                    indexStringOther = harekeleriKaldir(indexStringOther).strip()
                    self.cumleOther = harekeleriKaldir(self.cumleOther).strip()
                else:
                    # Other languages: remove Arabic-script characters from the
                    # index words only.
                    indexStringOther = arapcalariKaldir(indexStringOther).strip()

                indexStringOther = indexStringOther.split(' ')
                tireIsaretiDuzelt(indexStringOther)

                indexList.extend(indexStringOther)

                # Stored texts always start with a single space.
                self.cumle = self.cumle.strip()
                self.cumleOther = self.cumleOther.strip()
                self.cumle = " " + self.cumle
                self.cumleOther = " " + self.cumleOther
                self.yazdir = False
                self.devam = False
                insertP(self.conn, (self.attr.decode(encoding="utf-8"), self.section_id, self.cumle, self.cumleOther))

                self.yazdirOther = False
                self.devamOther = False
                # print self.cumleOther
                self.cumleOther = ''

                # print self.cumle
                # print '--------------------------------'
                self.cumle = ''
                self.attr = ''
                if self.baslik:
                    self.baslik = False
            else:
                # Not finished yet: stop capturing but keep the buffers.
                self.devam = False
                self.yazdir = False

                self.devamOther = False
                self.yazdirOther = False

    def handle_data(self, data):
        """Append character data to whichever buffers are currently capturing."""
        # Python 2: decode the incoming data as UTF-8 into a unicode object.
        data = unicode(data, 'utf-8')
        if (self.yazdir):
            self.cumle += data
        if (self.yazdirOther):
            self.cumleOther += data

    def sectionSonuEkleme(self):
        """Flush text still buffered at the end of a section.

        The buffers are inserted as they are: the normalisation and indexing
        done in handle_endtag are not applied here.
        """
        if self.cumle != '':
            self.yazdir = False
            self.devam = False
            self.yazdirOther = False
            self.devamOther = False
            # print self.cumleOther
            insertP(self.conn, (self.attr.decode(encoding="utf-8"), self.section_id, self.cumle, self.cumleOther))
            self.cumleOther = ''
            # print self.cumle
            # print '--------------------------------'
            self.cumle = ''
            self.attr = ''
            if self.baslik:
                self.baslik = False
            if self.baslikOther:
                self.baslikOther = False


def nonLetterKarakterleriKaldir(text):
    """Return ``text`` with punctuation, brackets, quote marks and ASCII digits removed.

    Two passes are applied: one for ASCII-style marks and digits, one for the
    typographic and Arabic-script marks.
    """
    without_marks_and_digits = re.sub(r'[,.)(«!:?/۞»;#◌\*\}\{\[\]\'\"]|[0-9]', '', text)
    return re.sub(u"[؛،”“…'ِ,﴾﴿ﷺ◌◌◌—]", "", without_marks_and_digits, flags=re.UNICODE)


def arapcalariKaldir(text):
    """Return ``text`` with every character in the listed Arabic-script code
    point ranges removed; all other characters are left as they are.
    """
    arabic_ranges = u"[\u0600-\u06ff]|[\u0750-\u077f]|[\ufb50-\ufbc1]|[\ufbd3-\ufd3f]|[\ufd50-\ufd8f]|[\ufd92-\ufdc7]|[\ufe70-\ufefc]|[\uFDF0-\uFDFD]"
    return re.sub(arabic_ranges, '', text, flags=re.UNICODE)


# Arabic diacritics (harakat) plus the tatweel elongation mark, matched directly
# as unicode code points.
_HAREKE_RE = re.compile(
    u"["
    u"\u0651"  # Tashdid
    u"\u064e"  # Fatha
    u"\u064b"  # Tanwin Fath
    u"\u064f"  # Damma
    u"\u064c"  # Tanwin Damm
    u"\u0650"  # Kasra
    u"\u064d"  # Tanwin Kasr
    u"\u0652"  # Sukun
    u"\u0640"  # Tatwil/Kashida
    u"]",
    re.UNICODE,
)


def harekeleriKaldir(text):
    """Return ``text`` with Arabic diacritics and tatweel removed.

    ``text`` is a unicode string and a unicode string is returned. The pattern
    works on code points, so there is no UTF-8 encode/decode round trip, and
    the function behaves the same on Python 2 and Python 3.
    """
    return _HAREKE_RE.sub(u'', text)


if __name__ == '__main__':
    # Import script: walks the per-language book folders under mypath, stores
    # every section's paragraphs in the SQLite database, then writes one
    # word-frequency index per language.
    # NOTE(review): create_connection returns None on failure and conn is
    # never closed explicitly -- confirm that is acceptable for this script.
    conn = create_connection("/home/clepz/Documents/RisaleKarsilastirmali/Risaleler.db")

    parser = MyHTMLParser(conn)

    mypath = "/home/clepz/Documents/RisaleKarsilastirmali/AppKarsilastirmaliKitaplar9Subat2020/books"

    # Folder-name prefix -> language code stored in the database.
    langs = {'ing': 'EN', 'arabi': 'AR', 'rusca': 'RU', 'almanca': 'DE', 'ozbek': 'UZ',
             'jap': 'JA', 'isp': 'ES', 'farsi': 'FA', 'endonezce': "ID", 'cince': "CN",
             'fransizca': 'FR'}

    folders = [folder for folder in listdir(mypath)]  # if "ing" in folder ]
    #     test = {'ing':'en'}
    # lang_code here is the module-level global that MyHTMLParser.handle_endtag
    # reads, so rebinding it in this loop switches the parser's language.
    for name, lang_code in langs.items():
        folderNames = [i for i in folders if i.startswith(name)]
        for f in folderNames:
            print f
            # add the book
            book_id = insertBook(conn, (f, lang_code))
            sectionsPath = mypath + "/" + f
            print sectionsPath + " ----- " + str(book_id)
            # get the sections inside the book
            sections = listdir(sectionsPath)
            sections.sort()
            for section in sections:  # read each section and add it to the database
                # Only files whose name contains "section-<digits>" are imported.
                sectionName = re.findall("section-[0-9]+", section)
                if sectionName.__len__() != 0:
                    parser.section_id = insertSection(conn, (book_id, sectionName[0]))
                    print sectionsPath + "/" + section
                    page = urllib.urlopen(sectionsPath + "/" + section).read()
                    parser.feed(page)
                    # Flush whatever text is still buffered for this section.
                    parser.sectionSonuEkleme()
            # when the sections are finished, continue with the next folder
        # once that language's books are finished.
        # No word index is written for 'JA'.
        if (lang_code != 'JA'):
            values = Counter(indexList)
            insertIndex(conn, values, lang_code)
            if (lang_code == 'EN'):
                # The Turkish index is built from the Turkish column of the EN books.
                values = Counter(indexListTr)
                insertIndex(conn, values, "TR")
                print 'tr dili indexi eklendi.'
                # Drop the Turkish word list; only the 'EN' pass extends it.
                indexListTr = None
            print lang_code + ' dili indexi eklendi'
        # Start the next language with an empty word list.
        indexList = []

    # html = open("/home/clepz/Documents/RisaleKarsilastirmali/arabi23soz/arabi23soz-section-0-normal.html").read()
    # paragraflar = html2text.html2text(html.decode("utf-8")).split('|')
    # count = 0

    """
    for prg in paragraflar:
        if count % 2 == 0:
            print prg
        count = count + 1
    """
    """
    listem = []
    for string in page.split("<td class=\"col-TR\">"):
        listem.append(string.split("</td>"))
    print (listem[0])
    """

    # print (type(page))
	# -*- coding: UTF-8 -*-
	import sqlite3
	from sqlite3 import Error
	import urllib
	import HTMLParser
	import html2text
	import re
	from os import listdir
	from os.path import isfile, join
	from collections import Counter

	# NOTE(review): this part of the file repeats the module globals defined
	# earlier in the file, at a different indentation.
	# Words collected from the other-language column for the current language.
	indexList = []
	# Words collected from the Turkish column (only while lang_code == 'EN').
	indexListTr = []
	# Code of the language currently being imported.
	lang_code = ''

	# True while the parser is inside a <tr> marked "IgnorePrg".
	inIgnorePrg = False


	def create_connection(db_file):
		"""Open a connection to the SQLite database stored at ``db_file``.

		Returns the connection object. If sqlite3 raises ``Error`` the error
		is printed and ``None`` is returned instead.
		"""
		conn = None
		try:
			conn = sqlite3.connect(db_file)
		except Error as e:
			print(e)
		return conn


	def insertP(conn, values):
		"""Insert one row into ``paragraphs`` and return its row id.

		``values`` is the 4-tuple (paragraph_name, section_id,
		paragraph_text_tr, paragraph_text_other).
		"""
		sql = ''' INSERT INTO paragraphs (paragraph_name,section_id,paragraph_text_tr,paragraph_text_other) VALUES(?,?,?,?) '''
		# The connection context manager commits on success, rolls back on error.
		with conn:
			cur = conn.cursor()
			cur.execute(sql, values)
			return cur.lastrowid


	def insertBook(conn, values):
		"""Insert one row into ``books`` and return its row id.

		``values`` is the pair (book_name, lang_code).
		"""
		sql = ''' INSERT INTO books (book_name,lang_code) VALUES(?,?) '''
		# The connection context manager commits on success, rolls back on error.
		with conn:
			cur = conn.cursor()
			cur.execute(sql, values)
			return cur.lastrowid


	def insertSection(conn, values):
		"""Insert one row into ``booksSections`` and return its row id.

		``values`` is the pair (book_id, section_name).
		"""
		sql = ''' INSERT INTO booksSections (book_id,section_name) VALUES(?,?) '''
		# The connection context manager commits on success, rolls back on error.
		with conn:
			cur = conn.cursor()
			cur.execute(sql, values)
			return cur.lastrowid


	def insertIndex(conn, values, lang_code):
		"""Store a word-count mapping in ``indexList`` under ``lang_code``.

		``values`` maps word -> count. Words are inserted in sorted order and
		the empty string is skipped. Nothing is returned.
		"""
		sql = ''' INSERT INTO indexList (lang_code,word, count) VALUES(?,?,?)'''
		# All rows go in one transaction: committed on success, rolled back on error.
		with conn:
			cur = conn.cursor()
			for value in sorted(values):
				if (value != ''):
					cur.execute(sql, (lang_code, value, values[value]))


	def tireIsaretiDuzelt(value):
		"""Strip one leading and one trailing hyphen from every word, in place.

		``value`` is a list of strings and is modified directly; nothing is
		returned. Empty strings are left untouched, and a word consisting only
		of a hyphen becomes the empty string.

		The previous slices (``[1:-1]`` and ``[0:-2]``) also removed real
		letters from the end of the word; only the hyphen is dropped now.
		"""
		for i, word in enumerate(value):
			# [:1] / [-1:] are safe on empty strings, so no emptiness checks.
			if word[:1] == '-':
				word = word[1:]
			if word[-1:] == '-':
				word = word[:-1]
			value[i] = word


	class MyHTMLParser(HTMLParser.HTMLParser, object):
		"""HTML parser that collects the text of "col-TR" and "col-OTHER" cells
		and stores each completed paragraph through insertP.

		``section_id`` is written into every inserted row, so the caller sets
		it before feeding a section. ``sectionSonuEkleme`` flushes whatever
		text is still buffered once a section has been fed.
		"""

		def __init__(self, conn):
			"""Set up the capture state; ``conn`` is the connection passed to insertP."""
			super(MyHTMLParser, self).__init__()
			# yazdir: character data is currently appended to self.cumle.
			self.yazdir = False
			# devam: a "col-TR" element has been seen and its cell is still open.
			self.devam = False
			# Same two flags for the "col-OTHER" column / self.cumleOther.
			self.yazdirOther = False
			self.devamOther = False
			self.conn = conn
			# Not read anywhere else in this class.
			self.prgSayi = 1
			# Text accumulated from the Turkish column.
			self.cumle = ""
			# Set when a named <a> is seen in the Turkish column.
			self.baslik = False
			# Only ever reset (never set to True) in this class.
			self.baslikOther = False
			# Paragraph name taken from a `name` attribute.
			self.attr = ""
			# Text accumulated from the other-language column.
			self.cumleOther = ""
			# Placeholder until the caller assigns the real section row id.
			self.section_id = -1

		def handle_starttag(self, tag, attrs):
			"""Update the capture flags for an opening tag.

			Only the value of the FIRST attribute is inspected to recognise
			row and column markers.
			"""
			# print("Found a start tag:", tag)
			global inIgnorePrg
			# NOTE(review): `is not 0` compares identity, not value, and a
			# valueless attribute may arrive with value None, which would make
			# the `in` tests below raise TypeError -- confirm against the input.
			if (len(attrs) is not 0):
				if (tag == "tr" and "IgnorePrg" in attrs[0][1]):
					inIgnorePrg = True

				elif (tag == "tr" and "row-Paragraph" in attrs[0][1]):
					inIgnorePrg = False

			# Other-language column: a "col-OTHER" element opens it; a
			# following <p> starts the capture.
			if (len(attrs) is not 0):
				if (attrs[0][1] == "col-OTHER"):
					self.devamOther = True
			if (self.devamOther):
				if (tag == "p"):
					if inIgnorePrg:
						# Inside an ignored row only paragraphs with an
						# attribute value containing "başlık" are kept.
						baslik = False
						for attr in attrs:
							if "başlık" in attr[1].lower():
								baslik = True
						if not baslik:
							self.devamOther = False
							self.yazdirOther = False
							return
					self.yazdirOther = True

			# Turkish column: same scheme, keyed on "col-TR".
			if (len(attrs) is not 0):
				if (attrs[0][1] == "col-TR"):
					self.devam = True
			if (self.devam):
				if (tag == "p"):

					if inIgnorePrg:
						baslik = False
						for attr in attrs:
							if "başlık" in attr[1].lower():
								baslik = True
						if not baslik:
							self.devam = False
							self.yazdir = False
							self.yazdirOther = False
							return
					# Capturing Turkish text switches the other capture off.
					self.yazdir = True
					self.yazdirOther = False
					self.devamOther = False
					# Keep the first paragraph name seen since the last flush.
					if self.attr == '':
						for attr in attrs:
							if attr[0] == "name":
								self.attr = attr[1]
				if (tag == "a"):
					for attr in attrs:  # [ (class,value), (name,value) ]
						if (attr[0] == "name"):
							# A named anchor overrides the paragraph name and
							# forces a flush at the end of the cell.
							self.attr = attr[1]
							self.baslik = True
							# print self.attr

			# used to add the texts from the other language.

		def handle_endtag(self, tag):
			"""On ``</td>`` with buffered Turkish text: normalise both buffers
			and, if the Turkish text looks finished, index and store the
			paragraph.

			When the text does not look finished only the capture flags are
			cleared; the buffers are kept, so later cells keep appending.
			"""
			if (tag == 'td' and self.cumle != ''):
				# Raw Turkish text, used only for the ending test below.
				cumleKontrol = self.cumle.strip()
				self.cumle = self.cumle.replace("\n", " ")
				self.cumle = re.sub(u' +', ' ', self.cumle)
				self.cumle = re.sub(u'\\t+', ' ', self.cumle)
				self.cumle = re.sub(u'[‘’‚„\'`´’¿¡]', '', self.cumle)
				# Map capital I to dotless ı before lower() would give "i".
				self.cumle = self.cumle.replace(u"I", u"ı")
				self.cumle = self.cumle.lower()
				# Drop the circumflex from â, û and î.
				self.cumle = self.cumle.replace(u'â', u'a').replace(u'û', u'u').replace(u'î', u'i')

				self.cumleOther = self.cumleOther.replace("\n", " ")
				self.cumleOther = re.sub(u' +', ' ', self.cumleOther)
				self.cumleOther = re.sub(u'\\t+', ' ', self.cumleOther)
				self.cumleOther = re.sub(u'[‘’‚„\'`´¿¡]', '', self.cumleOther)
				self.cumleOther = self.cumleOther.lower()
				# Complete when one of these marks occurs in the last few
				# characters of the raw Turkish text, or a named anchor was seen.
				if (u'\u2026' in cumleKontrol[-2:] or '*' in cumleKontrol[-2:] or ')' in cumleKontrol[-2:] or '.' in cumleKontrol[
					-1] or '.' in cumleKontrol[-3:] or '!' in cumleKontrol[-3:] or ':' in cumleKontrol[-3:] or '?' in cumleKontrol[
					-3:] or self.baslik):
					global indexList
					global indexListTr
					global lang_code
					# The Turkish word index is built only during the 'EN' import.
					if (lang_code == 'EN'):
						indexListStringTr = nonLetterKarakterleriKaldir(self.cumle).strip()
						indexListStringTr = arapcalariKaldir(indexListStringTr).strip()
						indexListStringTr = indexListStringTr.split(' ')
						tireIsaretiDuzelt(indexListStringTr)
						indexListTr.extend(indexListStringTr)

					indexStringOther = nonLetterKarakterleriKaldir(self.cumleOther).strip()

					if (lang_code == "AR" or lang_code == "FA"):
						# Arabic/Persian: remove diacritics from both the index
						# words and the stored text.
						indexStringOther = harekeleriKaldir(indexStringOther).strip()
						self.cumleOther = harekeleriKaldir(self.cumleOther).strip()
					else:
						# Other languages: remove Arabic-script characters from
						# the index words only.
						indexStringOther = arapcalariKaldir(indexStringOther).strip()

					indexStringOther = indexStringOther.split(' ')
					tireIsaretiDuzelt(indexStringOther)

					indexList.extend(indexStringOther)

					# Stored texts always start with a single space.
					self.cumle = self.cumle.strip()
					self.cumleOther = self.cumleOther.strip()
					self.cumle = " " + self.cumle
					self.cumleOther = " " + self.cumleOther
					self.yazdir = False
					self.devam = False
					insertP(self.conn, (self.attr.decode(encoding="utf-8"), self.section_id, self.cumle, self.cumleOther))

					self.yazdirOther = False
					self.devamOther = False
					# print self.cumleOther
					self.cumleOther = ''

					# print self.cumle
					# print '--------------------------------'
					self.cumle = ''
					self.attr = ''
					if self.baslik:
						self.baslik = False
				else:
					# Not finished yet: stop capturing but keep the buffers.
					self.devam = False
					self.yazdir = False

					self.devamOther = False
					self.yazdirOther = False

		def handle_data(self, data):
			"""Append character data to whichever buffers are currently capturing."""
			# Python 2: decode the incoming data as UTF-8 into a unicode object.
			data = unicode(data, 'utf-8')
			if (self.yazdir):
				self.cumle += data
			if (self.yazdirOther):
				self.cumleOther += data

		def sectionSonuEkleme(self):
			"""Flush text still buffered at the end of a section.

			The buffers are inserted as they are: the normalisation and
			indexing done in handle_endtag are not applied here.
			"""
			if self.cumle != '':
				self.yazdir = False
				self.devam = False
				self.yazdirOther = False
				self.devamOther = False
				# print self.cumleOther
				insertP(self.conn, (self.attr.decode(encoding="utf-8"), self.section_id, self.cumle, self.cumleOther))
				self.cumleOther = ''
				# print self.cumle
				# print '--------------------------------'
				self.cumle = ''
				self.attr = ''
				if self.baslik:
					self.baslik = False
				if self.baslikOther:
					self.baslikOther = False


	def nonLetterKarakterleriKaldir(text):
		"""Return ``text`` with punctuation, brackets, quote marks and ASCII digits removed.

		The first pattern uses a real alternation (``|``) between the
		punctuation class and the digit class; the escaped ``\\|`` that stood
		here required a literal pipe and so never stripped digits.
		"""
		text = re.sub(r'[,.)(«!:?/۞»;#◌\*\}\{\[\]\'\"]|[0-9]', '', text)
		text = re.sub(u"[؛،”“…'ِ,﴾﴿ﷺ◌◌◌—]", "", text, flags=re.UNICODE)
		return text


	def arapcalariKaldir(text):
		"""Return ``text`` with every character in the listed Arabic-script
		code point ranges removed.

		The ranges are joined with a real alternation (``|``); the escaped
		``\\|`` that stood here demanded literal pipes between characters, so
		nothing was ever removed.
		"""
		text = re.sub(
			u"[\u0600-\u06ff]|[\u0750-\u077f]|[\ufb50-\ufbc1]|[\ufbd3-\ufd3f]|[\ufd50-\ufd8f]|[\ufd92-\ufdc7]|[\ufe70-\ufefc]|[\uFDF0-\uFDFD]",
			'', text, flags=re.UNICODE)
		return text


	def harekeleriKaldir(text):
		"""Return ``text`` with Arabic diacritics and tatweel removed.

		``text`` is a unicode string and a unicode string is returned. The
		marks are matched as code points in a single character class, so the
		escaped ``\\|`` separators and the UTF-8 encode/decode round trip of
		the previous version are no longer needed.
		"""
		noise = re.compile(
			u"["
			u"\u0651"  # Tashdid
			u"\u064e"  # Fatha
			u"\u064b"  # Tanwin Fath
			u"\u064f"  # Damma
			u"\u064c"  # Tanwin Damm
			u"\u0650"  # Kasra
			u"\u064d"  # Tanwin Kasr
			u"\u0652"  # Sukun
			u"\u0640"  # Tatwil/Kashida
			u"]",
			re.UNICODE)
		return noise.sub(u'', text)


	if __name__ == '__main__':
	conn = create_connection("/home/clepz/Documents/RisaleKarsilastirmali/Risaleler.db")

	parser = MyHTMLParser(conn)

	mypath = "/home/clepz/Documents/RisaleKarsilastirmali/AppKarsilastirmaliKitaplar9Subat2020/books"

	langs = {'ing': 'EN', 'arabi': 'AR', 'rusca': 'RU', 'almanca': 'DE', 'ozbek': 'UZ',
	'jap': 'JA', 'isp': 'ES', 'farsi': 'FA', 'endonezce': "ID", 'cince': "CN",
	'fransizca': 'FR'}

	folders = [folder for folder in listdir(mypath)] # if "ing" in folder ]
	# test = {'ing':'en'}
	for name, lang_code in langs.items():
	folderNames = [i for i in folders if i.startswith(name)]
	for f in folderNames:
	print f
	# kitabi ekle
	book_id = insertBook(conn, (f, lang_code))
	sectionsPath = mypath + "/" + f
	print sectionsPath + " ----- " + str(book_id)
	# kitap icerisindeki sectionlari al
	sections = listdir(sectionsPath)
	sections.sort()
	for section in sections: # her sectionlarin icini oku ve databaseye ekle....
	sectionName = re.findall("section-[0-9]+", section)
	if sectionName.__len__() != 0:
	parser.section_id = insertSection(conn, (book_id, sectionName[0]))
	print sectionsPath + "/" + section
	page = urllib.urlopen(sectionsPath + "/" + section).read()
	parser.feed(page)
	parser.sectionSonuEkleme()
	# sectionlar bittiginde diger dosyayla devam et
	# o dilin kitaplari bittiginde.
	if (lang_code != 'JA'):
	values = Counter(indexList)
	insertIndex(conn, values, lang_code)
	if (lang_code == 'EN'):
	values = Counter(indexListTr)
	insertIndex(conn, values, "TR")
	print 'tr dili indexi eklendi.'
	indexListTr = None
	print lang_code + ' dili indexi eklendi'
	indexList = []

	# html = open("/home/clepz/Documents/RisaleKarsilastirmali/arabi23soz/arabi23soz-section-0-normal.html").read()
	# paragraflar = html2text.html2text(html.decode("utf-8")).split('\|')
	# count = 0

	"""
	for prg in paragraflar:
	if count % 2 == 0:
	print prg
	count = count + 1
	"""
	"""
	listem = []
	for string in page.split("<td class=\"col-TR\">"):
	listem.append(string.split("</td>"))
	print (listem[0])
	"""

	# print (type(page))