-
-
Save LeNarvalo/b024ec6f4e23cc2500e180d2b3df0fc3 to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# -*- coding: utf8 -*- | |
import urllib, webbrowser | |
import unicodedata | |
import threading | |
from Tkinter import * | |
import ttk | |
#import tkMessageBox | |
import os | |
import time | |
import re | |
#windowsAnswer = False | |
#def windowsAlert(): | |
# global windowsAnswer | |
# windowsAnswer = tkMessageBox.askyesno("Continuer?", "Le script va ouvrir un nombre de page important, voulez vous continuer?") | |
# Root Tk window; created at import time and used as parent by fenetre().
master=Tk()
master.configure(background='#535353')
###VARIABLES###
# NOTE: a `global` statement at module level has no effect; the names below
# are module globals simply because they are assigned here.
global derTxt, texto, auteurSaved
auteurSaved = ""   # author name shown in the header of the generated results page
derTxt = ""        # last message pushed to the `texto` log widget
# The four flags below are not referenced anywhere else in the visible code.
displaySignature = False
displayVideo = False
displayImage = False
listResult = []    # search hits, filled by research() as [page, post id, comment, author]
listCom = []       # forum-post ids (strings), filled by createFPList()
dico = {}          # author -> flat list of page offset, post id, comment text, ...
disableDisplay = True
# Directory that holds the archive (Sujet.html) and the generated results page.
chemin = os.path.expanduser('~/Veganisme')
# HTML fragments stripped from a comment before it is written to the results page.
balises=["<blockquote>",'<div class="xoopsQuote">',"<br />"]
# Pairs [escaped UTF-8 byte sequence, replacement character], applied one after
# the other with str.replace().
accents=[["\xc3\xa7","ç"],["\xc3\x87","ç"],["\xe2\x80\x99","'"],["\xc3\xa9","é"],["\xc3\xa0","à"],["\xc3\xa8","è"],["\xc3\xb4","ô"],["\xc3\xb9","ù"],["\xc3\xaa","ê"],\
["\xc3\xae","î"],["\xc3\xaf","ï"]]
# Pairs [escape sequence spelled out as literal text (backslash, x, hex digits),
# the escape sequence itself].
accents2=[["\\xc3\\xa7","\xc3\xa7"],["\\xc3\\xae","\xc3\xae"],["\\xc3\\x87","\xc3\x87"],["\\xe2\\x80\\x99","\xe2\x80\x99"],["\\xc3\\xa9","\xc3\xa9"],["\\xc3\\xa0","\xc3\xa0"],\
["\\xc3\\xa8","\xc3\xa8"],["\\xc3\\xb4","\xc3\xb4"],["\\xc3\\xb9","\xc3\xb9"],["\\xc3\\xaa","\xc3\xaa"],["\\xc3\\xaf","\xc3\xaf"]]
# HTML/CSS page skeleton kept as a one-element list.
# NOTE(review): `d` is not referenced elsewhere in the visible code.
d = ['''<!DOCTYPE html>
<style>
table {
width:100%;
}
table, th, td {
border: 1px solid black;
border-collapse: collapse;
}
th, td {
padding: 15px;
text-align: left;
}
table#t01 tr:nth-child(even) {
background-color: #eee;
}
table#t01 tr:nth-child(odd) {
background-color: #fff;
}
</style>
<body>
<table id="t01">
''']
############### | |
# Download the first page of the topic once, at import time, to work out how
# many pages the topic has (`nbOfPages`).
pageTest=urllib.urlopen('https://www.koreus.com/modules/newbb/topic160787.html')
strpageTest=pageTest.read()
# Pagination markup expected on page 1: "(1) 2 3 4 ... <a". Whatever follows it
# is the remainder of the next <a ...> tag.
splitText = '''<b>(1)</b> <a href="/modules/newbb/topic160787-20.html">2</a> <a href="/modules/newbb/topic160787-40.html">3</a> <a href="/modules/newbb/topic160787-60.html">4</a> ... <a '''
listSST = strpageTest.split(splitText)
# NOTE(review): listSST[1] raises IndexError when the marker above is not
# found in the page (e.g. if the site's pagination markup changes).
listEST = listSST[1].split("</a>")
# Text between the end of the opening tag ('>') and '</a>' is the link label,
# which is converted to an integer.
listNSST = listEST[0].split(">")
nbOfPages = int(listNSST[1])
def createFPList(strpage):
    """Append the forum-post ids found in *strpage* to the global ``listCom``.

    Each ``<a id="forumpostNNN">`` anchor of the page contributes the string
    ``NNN``, in page order. The function returns nothing; it only extends
    ``listCom``.
    """
    global listCom  # list of forum-post ids, kept as strings
    marker = '<a id="f'
    rest_of_id = 'orumpost'  # what remains of 'forumpost' once `marker` is split off
    for chunk in strpage.split(marker):
        if not chunk.startswith(rest_of_id):
            continue
        attribute_value = chunk.split('">')[0]
        listCom.append(attribute_value[len(rest_of_id):])
def getDicoFromStr(fileStr):
    """Rebuild the author -> comments dictionary from the archive text.

    The archive (Sujet.html) is written as a ``<meta>`` tag, followed by
    ``str(dico)``, followed by a newline and a number. This function extracts
    the dictionary literal (from the first ``{`` to the last ``}``) and parses
    it with ``ast.literal_eval``, which only accepts Python literals and never
    executes code.

    Parsing the literal properly, instead of splitting the text on quote
    patterns, means that:
    * no author name has to be assumed for the first entry;
    * comments containing apostrophes (written with double quotes by ``repr``)
      are read back correctly;
    * escape sequences are decoded once, so the texts are identical to what
      was saved and do not accumulate backslashes over load/save cycles.

    Parameters:
        fileStr: full content of the archive file, as a string.

    Returns:
        The dictionary mapping each author to a flat list of strings
        (page offset, forum-post id, comment text, ...). An empty dictionary
        is returned when the archive cannot be parsed; in that case the
        message "Bug général" is also appended to the ``texto`` widget.

    Side effects:
        Rebinds the globals ``dico``, ``auteurList`` (authors found) and
        ``commentList`` (their comment lists, in the same order); sets
        ``derTxt`` on failure.
    """
    global auteurList, commentList, dico, derTxt
    import ast  # local import: only this function needs it

    dico = {}
    auteurList = []
    commentList = []
    try:
        debut = fileStr.index("{")
        # Nothing but the trailing number follows the dictionary, so the last
        # '}' of the file is the one closing it.
        fin = fileStr.rindex("}") + 1
        contenu = ast.literal_eval(fileStr[debut:fin])
        if not isinstance(contenu, dict):
            raise ValueError("archive does not contain a dictionary")
    except (ValueError, SyntaxError, TypeError):
        # Unreadable archive: report it in the log widget, as before.
        derTxt = "\n"+"Bug général"
        texto.insert(END, derTxt)
        return dico

    for auteur, commentaires in contenu.items():
        dico[auteur] = list(commentaires)
        auteurList.append(auteur)
        commentList.append(dico[auteur])
    return dico
def rechercher():
    """Create or refresh the on-disk archive of the topic, then start the search.

    First use (no Sujet.html under ``chemin``): every page of the topic is
    downloaded, starting at offset 0 and stepping by 20, until the offset
    equals ``nbOfPages * 20``. For each post the page offset, the forum-post
    id and the comment HTML (with <img> tags and opening <iframe> tags
    removed) are stored under the author's name in ``dico``. The archive is
    then written: a <meta> tag, ``str(dico)``, and on a last line the length
    of the last page processed.

    Later uses: ``dico`` is loaded from the archive if it is empty in memory,
    pages are re-downloaded from offset 1960 and their length compared with
    the stored length; when they differ, pages are scraped again and the
    archive is rewritten.

    In both cases a daemon thread running ``research`` is started at the end.
    """
    # SAVES THE TOPIC / UPDATES THE ARCHIVE / SEARCHES THE TEXT EXCERPT BY AUTHOR
    global listCom, dico, texto, derTxt, research_thread
    lastPage=int(nbOfPages)*20 #LAST PAGE + (1*20)
    # FIRST USE (NO ARCHIVE ON DISK YET)
    if not os.path.isfile(chemin+'\\Sujet.html'):
        if not os.path.exists(chemin):
            os.mkdir(chemin)
        # Create the (empty) archive file right away.
        file = open(chemin+"\\Sujet.html","w")
        file.close()
        listCom = []
        page=urllib.urlopen('https://www.koreus.com/modules/newbb/topic160787.html')
        strpage=page.read()
        suf = range(0,9999, 20)  # page offsets used in the topic URLs
        countPage = 0
        countComm = -1  # index into listCom of the post being processed
        while suf[countPage]!=lastPage:
            lastLenPage=len(strpage)
            createFPList(strpage)
            # DICT OF COMMENTS PER AUTHOR
            b = strpage.split('href="/memb')
            for c in b:
                if c.startswith('re/'):
                    countComm += 1
                    e = c.split('">')
                    i = e[1].split('</a>')
                    auteur = i[0]  # label of the member link
                    f = c.split('<div class="comText')
                    f1 = f[1]
                    h = f1.split('</div>\r\n\t <br clear="all" />') # h[0] = comment text only
                    h02 = h[0][2:]
                    ##REMOVE IMG
                    while "<img" in h02:
                        idS = h02.index("<img")
                        idE = h02[idS:].index(">")
                        # Removes every occurrence of this exact tag text.
                        h02=h02.replace(h02[idS:idS+idE+1],"")
                    ##REMOVE VID
                    h3 = ""  # not used afterwards in this function
                    while "<iframe" in h02 :
                        idS = h02.index("<iframe")
                        # The closing '>' is searched from 10 characters after
                        # the start of the tag; only the opening tag is removed.
                        idE = h02[idS+10:].index(">")
                        h02=h02.replace(h02[idS:idS+idE+11],"")
                    try:
                        dico[auteur].extend([str(suf[countPage]),listCom[countComm],h02])
                    except:
                        # First post seen for this author.
                        dico[auteur]=[str(suf[countPage]),listCom[countComm],h02]
            countPage+=1
            # The next page is fetched before the loop condition is re-tested,
            # so the page at offset `lastPage` is downloaded too but not parsed.
            page=urllib.urlopen('https://www.koreus.com/modules/newbb/topic160787-'+str(suf[countPage])+'.html')
            strpage=page.read()
            pCurrent['value'] = countPage  # progress bar
        # ARCHIVING
        file = open(chemin+"\\Sujet.html","w")
        file.write('<meta charset="UTF-8">')
        file.write(str(dico))
        file.write("\n"+str(lastLenPage))
        file.close()
    # ARCHIVE ALREADY EXISTS
    else:
        if len(dico) < 1:
            file = open(chemin+"\\Sujet.html","r")
            fileStr = file.read()
            file.close()
            dico = getDicoFromStr(fileStr)
        # CHECK WHETHER AN UPDATE IS NEEDED
        listCom = []
        # NOTE(review): 1960 (and 1980 below) are hard-coded start offsets;
        # presumably the end of the topic when the script was written - confirm.
        suf = range(1960,9999, 20)
        countPage = 0
        page=urllib.urlopen('https://www.koreus.com/modules/newbb/topic160787-'+str(suf[countPage])+'.html')
        strpage=page.read()
        news = True
        file = open(chemin+"\\Sujet.html","r")
        fileList = file.readlines()
        file.close()
        lastLenPage = int(fileList[-1])  # page length stored on the archive's last line
        while suf[countPage]!=lastPage:
            # `news` is overwritten on every iteration, so after the loop it
            # reflects the comparison made for the last page tested.
            if len(strpage) == lastLenPage:
                news = False
            if len(strpage) != lastLenPage:
                news = True
            countPage+=1
            page=urllib.urlopen('https://www.koreus.com/modules/newbb/topic160787-'+str(suf[countPage])+'.html')
            strpage=page.read()
        if not news:
            derTxt = "\n"+"BASE A JOUR!"
            texto.insert(END, derTxt)
        if news:
            derTxt = "\n"+"MISE A JOUR DE LA BASE..."
            texto.insert(END, derTxt)
            # NOTE(review): at this point suf[countPage] equals lastPage, so
            # this downloads the page at offset `lastPage`; the loop below then
            # parses that content while labelling it with offset 1980 on its
            # first iteration - verify this is intended.
            page=urllib.urlopen('https://www.koreus.com/modules/newbb/topic160787-'+str(suf[countPage])+'.html')
            strpage=page.read()
            suf = range(1980,9999, 20)
            countPage = 0
            countComm = -1
            while suf[countPage]!=lastPage:
                createFPList(strpage)
                lastLenPage=len(strpage)
                # DICT OF COMMENTS PER AUTHOR
                b = strpage.split('href="/memb')
                for c in b:
                    if c.startswith('re/'):
                        countComm += 1
                        e = c.split('">')
                        i = e[1].split('</a>')
                        auteur = i[0]
                        f = c.split('<div class="comText')
                        f1 = f[1]
                        h = f1.split('</div>\r\n\t <br clear="all" />') # h[0] = comment text only
                        h02 = h[0][2:]
                        ##REMOVE IMG
                        while "<img" in h02:
                            idS = h02.index("<img")
                            idE = h02[idS:].index(">")
                            h02=h02.replace(h02[idS:idS+idE+1],"")
                        ##REMOVE VID
                        h3 = ""
                        while "<iframe" in h02 :
                            idS = h02.index("<iframe")
                            idE = h02[idS+10:].index(">")
                            h02=h02.replace(h02[idS:idS+idE+11],"")
                        if auteur in dico:
                            # Only add posts whose id is not archived yet.
                            if listCom[countComm] not in dico[auteur]:
                                dico[auteur].extend([str(suf[countPage]),listCom[countComm],h02])
                                derTxt = "\n"+"ECRITURE EN COURS"
                                texto.insert(END, derTxt)
                        else:
                            dico[auteur]=[str(suf[countPage]),listCom[countComm],h02]
                countPage+=1
                page=urllib.urlopen('https://www.koreus.com/modules/newbb/topic160787-'+str(suf[countPage])+'.html')
                strpage=page.read()
            # Rewrite the whole archive with the updated dictionary.
            file = open(chemin+"\\Sujet.html","w")
            file.write('<meta charset="UTF-8">')
            file.write(str(dico))
            file.write("\n"+str(lastLenPage))
            file.close()
    # The archive is ready: run the actual search in its own daemon thread.
    research_thread = threading.Thread(target=research)
    research_thread.daemon = True
    research_thread.start()
def research():
    """Search ``dico`` for the requested excerpt and collect the hits.

    Reads the globals prepared by ``init2`` (``auteur``, ``auteur2``,
    ``entree2``) and the ``casse`` checkbox. Authors are selected when
    ``auteur2`` is contained in the lower-cased author name, or all of them
    when ``auteur`` is ``*``. Every hit is appended to ``listResult`` as
    ``[page, num, comm, key]`` and counted in ``result``; the first 199 hits
    are also listed in the ``texto`` widget. A daemon thread running
    ``buildPageWeb`` is started once the scan is over.

    Any exception raised in the body is silently discarded.
    """
    global listResult, dico, texto, derTxt, auteurSaved, result, entree2, auteur2
    # SEARCH THE EXCERPT BY AUTHOR
    try:
        derTxt = "\n"+"RECHERCHE EN COURS..."
        texto.insert(END, derTxt)
        pCurrent['mode'] = "indeterminate"
        pCurrent.start(1)
        listResult = []
        result = 0
        for key in dico.keys():
            if auteur2 in key.lower() or auteur==u"*":
                if auteur != u"*":
                    auteurSaved = key
                else:
                    auteurSaved = "Auteur inconnu"
                for comm in dico[key]:
                    # Entries of dico[key] are page offset, post id, text, ...
                    # A numeric entry below 200000 is taken as a page offset,
                    # one above 200000 as a forum-post id.
                    try:
                        if int(comm)<200000:
                            page = comm
                    except:
                        None
                    try:
                        if int(comm)>200000:
                            num = comm
                    except:
                        None
                    try:
                        comm1 = comm
                        for a in accents:
                            comm1 = comm1.replace(a[0],a[1])
                        if not casse.get():
                            # Case-insensitive search: strip accents and lower-case.
                            try:
                                comm2 = unicodedata.normalize('NFKD', comm1.decode('utf8').lower()).encode('ascii', 'ignore')
                            except:
                                comm2 = comm1.lower() # normally unnecessary
                        else:
                            comm2 = comm1
                        # len(comm.split())>1 skips single-token entries
                        # (which includes the numeric page / post-id entries).
                        if entree2.encode('utf8') in comm2 and len(comm.split())>1 :
                            if result < 200:
                                derTxt = "\n"+"PAGE:"+str(int(page)/20)+" FORUMPOST :"+str(num)
                                texto.insert(END, derTxt)
                            result += 1
                            listResult.append([page,num,comm,key])
                            #if result % 80 == 0:
                            # derTxt = "\n"+"PAUSE REQUIERED"
                            # texto.insert(END, derTxt)
                            # time.sleep(1)
                    except:
                        None
                # A named author was matched: stop at the first matching key.
                if auteur2 in key.lower():
                    break
        buildPageWeb_thread = threading.Thread(target=buildPageWeb)
        buildPageWeb_thread.daemon = True
        buildPageWeb_thread.start()
        if auteur==u"*":
            return
        # NOTE(review): the nesting of this function was reconstructed; as laid
        # out here the message below is shown for every search on a named
        # author, whether or not one matched - confirm the intended placement.
        derTxt = "\n"+"AUTEUR NON TROUVVE"
        texto.insert(END, derTxt)
        pCurrent.stop()
    except:
        None
def _truncate_excerpt(txt, limite=550):
    """Return at most *limite* leading characters of *txt*.

    When the kept part ends inside an anchor (its last ``<a`` comes after its
    last ``/a>``), the text is cut just before that ``<a`` so that no
    half-open link - not even its leading ``<`` - ends up in the page.
    """
    debut = txt[:limite]
    ouverture = debut.rfind('<a')
    fermeture = debut.rfind('/a>')
    if ouverture != -1 and ouverture > fermeture:
        return debut[:ouverture]
    return debut


def buildPageWeb(afficherPlusDe200=False):
    """Write the search results to ``pageWeb.html`` and open the page.

    Reads the globals ``result`` (number of hits), ``listResult`` (hits as
    ``[page, num, comm, author]``), ``auteurSaved`` and ``entree2``.

    Parameters:
        afficherPlusDe200: when False and there are 200 hits or more, nothing
            is written: the log widget only reports ">200" and the
            ``Afficher`` button is enabled. When True the page is built
            whatever the number of hits.

    Side effects:
        Stops the progress bar, writes ``pageWeb.html`` under ``chemin`` and
        starts a daemon thread running ``displaySearch``.
    """
    global texto, derTxt
    pCurrent.stop()
    if result >= 200 and not afficherPlusDe200:
        derTxt=""
        texto.delete(1.0, END)
        derTxt = "\n"+"Nombre de resultats : >200"
        texto.insert(END, derTxt)
        Afficher['state']='normal'
        return
    if result < 200:
        derTxt = "\n"+"Nombre de resultats : "+str(result)
        texto.insert(END, derTxt)
    e = '''<meta charset="UTF-8">
<!DOCTYPE html>
<html>
<head>
<style>
table {
width:100%;
}
table, th, td {
border: 1px solid black;
border-collapse: collapse;
}
th, td {
padding: 15px;
text-align: left;
}
table#t01 tr:nth-child(even) {
background-color: #e0dfe7;
}
table#t01 tr:nth-child(odd) {
background-color: #fff;
}
</style>
</head><table id="t01" cellpadding="3px" cellspacing="0px" rules="all" style="border:solid 1px black; border-collapse:collapse; text-align:center;"> '''
    # Encode into a local instead of overwriting the global `entree2`: this
    # function can run a second time for the same search (via the
    # "Afficher > 200" button) and must then start from the same value.
    if casse.get():
        extrait = entree2.encode('utf8')
    else:
        extrait = entree2
    # `with` guarantees the file is closed even if a write fails.
    with open(chemin+"\\pageWeb.html","w") as pageWeb:
        pageWeb.write(e)
        pageWeb.write('''<tr>
<th colspan="2" style="width:140px;background-color:#9a9ace"><FONT color="#fff">'''+auteurSaved+''' - Extrait du texte recherché : '''+extrait+''' - <U>Nombre de résultat(s)</U> : '''+str(result)+'''</FONT></th>
</tr>''')
        for page, num, comm, auteurRes in listResult:
            txt = comm
            for b in balises:
                txt = txt.replace(b,"")
            for a in accents:
                txt = txt.replace(a[0],a[1])
            txt2 = _truncate_excerpt(txt)
            # `//` keeps the page number an integer on every Python version.
            pageWeb.write('<tr><td>'+txt2+'<div align="right"><font face="verdana" color="orange" size="2">-<i><b>'+str(auteurRes)+'</b></i></font></div></td><td><a href="https://www.koreus.com/modules/newbb/topic160787-'+str(page)+'.html#forumpost'+str(num)+'"> Page '+str(int(page)//20)+' / ForumPost.'+str(num)+'</a></td></tr>')
    displayPageWeb = threading.Thread(target=displaySearch)
    displayPageWeb.daemon = True
    displayPageWeb.start()
def displaySearch():
    """Open the generated results page and disable the ``Afficher`` button.

    The page is opened from ``chemin`` (the directory the results page is
    written to) rather than from a fixed user directory, so it works for any
    user account. ``os.startfile`` opens the file with the application
    associated with it; it is only available on Windows.
    """
    os.startfile(chemin+"\\pageWeb.html")
    Afficher['state']='disabled'
def init2():
    """Callback of the search button: validate the input and start the worker.

    Clears the log widget, resets the progress bar mode and disables the
    ``Afficher`` button. Both entry fields must be filled and the excerpt
    must be at least 5 characters long; otherwise a message is written to the
    log and nothing else happens. On success the globals ``auteur``,
    ``auteur2`` and ``entree2`` are set and ``rechercher`` is started in a new
    thread.
    """
    global search_thread, derTxt, texto, entree2, auteur2, auteur

    def sansAccentMinuscule(valeur):
        # Accent-stripped, lower-cased form; plain lower() when the
        # normalisation raises.
        try:
            return unicodedata.normalize('NFKD', valeur).encode('ascii', 'ignore').lower()
        except:
            return valeur.lower()

    # Reset the widgets left over from a previous search.
    derTxt = ""
    texto.delete(1.0, END)
    pCurrent['mode'] = "determinate"
    Afficher['state'] = 'disabled'

    saisieAuteur = entreeAuteur.get()  # unicode if accented, str otherwise
    saisieTexte = entreeText.get()     # unicode if accented, str otherwise

    if not saisieAuteur or not saisieTexte:
        derTxt = ("VEUILLEZ SAISIR UN NOM D'AUTEUR ET UN EXTRAIT DU TEXTE QU'IL AURAIT SAISI"
                  + "\nTAPEZ * POUR REMPLACER LE NOM DE L'AUTEUR SI VOUS NE SAVEZ PAS")
        texto.insert(END, derTxt)
        return
    if len(saisieTexte) < 5:
        derTxt = "\n" + "L'EXTRAIT DE TEXTE EST TROP COURT (MIN 5 CARACTERES)"
        texto.insert(END, derTxt)
        return

    auteur = saisieAuteur
    auteur2 = sansAccentMinuscule(auteur)
    # With the "Casse" box ticked the excerpt is used exactly as typed.
    if casse.get():
        entree2 = saisieTexte
    else:
        entree2 = sansAccentMinuscule(saisieTexte)

    search_thread = threading.Thread(target=rechercher)
    search_thread.start()
def afficherPlusDe200():
    """Callback of the ``Afficher`` button: build the results page even when
    there are 200 results or more."""
    buildPageWeb(afficherPlusDe200=True)
def fenetre():
    """Create and lay out every widget of the main window on ``master``.

    Rows of the grid: 0 author entry, 1 text entry, 2 "Casse" checkbox and
    the two buttons, 3 progress bar, 4 log text with its scrollbar. The
    widgets that other functions need are published as globals.
    """
    global Afficher, texto, pCurrent, entreeAuteur, entreeText, Rechercher, scrollbar, casse
    # `Frame(master).grid(...)` returns None, so the value it used to be bound
    # to was never a usable parent and the widgets silently fell back to the
    # default root. They are now attached to `master` explicitly. The empty
    # frames are still created and gridded so that the geometry (notably the
    # pady=20 on row 2) stays the same.

    # Row 0: author
    Frame(master).grid(row=0)
    auteur_label = Label(master, text="Auteur :",bg='#535353',fg="white")
    auteur_label.grid(row=0,column=0, sticky=W)
    entreeAuteur = Entry(master, width=50)
    entreeAuteur.grid(row=0,column=1, columnspan=3, sticky=W+E)

    # Row 1: text excerpt
    Frame(master).grid(row=1)
    text_label = Label(master, text="Texte :",bg='#535353',fg="white")
    text_label.grid(row=1,column=0, sticky=W)
    entreeText = Entry(master, width=50)
    entreeText.grid(row=1,column=1, columnspan=3, sticky=W+E)

    # Row 2: case-sensitivity checkbox and buttons
    Frame(master).grid(row=2,pady=20)
    casse = IntVar()
    bouton=Checkbutton(master, text="Casse", variable=casse,bg='#535353',activebackground='#535353',fg="#FFFFFF",selectcolor="black")
    bouton.grid(row=2)
    Rechercher = Button(master, text ='Rechercher', command=init2,width=20)
    Rechercher.grid(row=2,column=1, sticky=E)
    # Disabled until a search returns 200 results or more.
    Afficher = Button(master,width=20, compound=LEFT, overrelief=GROOVE, text ='Afficher > 200 résultats', fg="red", command=afficherPlusDe200, state=DISABLED)
    Afficher.grid(row=2,column=2, sticky=W)

    # Row 3: progress bar, one step per page of the topic
    Frame(master).grid(row=3)
    pCurrent = ttk.Progressbar(master, orient='horizontal', mode='determinate', value=5, maximum=nbOfPages)
    pCurrent.grid(row=3, column=0,columnspan=4, sticky=W+E)

    # Row 4: log area and its scrollbar
    Frame(master).grid(row=4)
    scrollbar = Scrollbar(master)
    scrollbar.grid(row=4, column=3,sticky=W+S+N)
    texto = Text(master, wrap=WORD, yscrollcommand=scrollbar.set, width=50)
    texto.grid(row=4,column=0, columnspan=3, sticky=W+E)
    scrollbar.config(command=texto.yview)
# Build the widgets, then hand control over to the Tk event loop.
fenetre()
master.mainloop()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment