-
-
Save LeNarvalo/6bc6cb673182fdab8a891f51cd617b2b to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# -*- coding: utf8 -*- | |
import ast
import os
import re
import threading
import time
import unicodedata
import urllib
import webbrowser

from Tkinter import *
import tkMessageBox
import ttk
# Root window of the application, with a dark grey background.
master = Tk()
master.config(background='#535353')
###VARIABLES###
# Module-level state shared by the functions below.
auteurSaved = derTxt = ""
displaySignature = displayVideo = displayImage = False
listResult, listCom = [], []
dico = {}
disableDisplay = True
# Folder that holds the archive and the generated result page.
chemin = os.path.expanduser('~/Veganisme')
# Markup removed from a post before it is written to the result page.
balises = [
    "<blockquote>",
    '<div class="xoopsQuote">',
    "<br />",
]
# [escaped or raw UTF-8 byte sequence, replacement character] pairs.
accents = [
    ["\\xc3\\xa7", "ç"],
    ["\\xc3\\xae", "î"],
    ["\\xc3\\x87", "ç"],
    ["\\xe2\\x80\\x99", "'"],
    ["\\xc3\\xa9", "é"],
    ["\\xc3\\xa0", "à"],
    ["\\xc3\\xa8", "è"],
    ["\\xc3\\xb4", "ô"],
    ["\\xc3\\xb9", "ù"],
    ["\\xc3\\xaa", "ê"],
    ["\xc3\xa7", "ç"],
    ["\xc3\x87", "ç"],
    ["\xe2\x80\x99", "'"],
    ["\xc3\xa9", "é"],
    ["\xc3\xa0", "à"],
    ["\xc3\xa8", "è"],
    ["\xc3\xb4", "ô"],
    ["\xc3\xb9", "ù"],
    ["\xc3\xaa", "ê"],
    ["\xc3\xae", "î"],
]
# HTML preamble (style sheet and opening table tag) kept as a one-item list.
d = ['''<!DOCTYPE html>
<style>
table {
width:100%;
}
table, th, td {
border: 1px solid black;
border-collapse: collapse;
}
th, td {
padding: 15px;
text-align: left;
}
table#t01 tr:nth-child(even) {
background-color: #eee;
}
table#t01 tr:nth-child(odd) {
background-color: #fff;
}
</style>
<body>
<table id="t01">
''']
###############
# Download the first page of the topic once at start-up to find out how many
# pages the topic has. The count sizes the progress bar and bounds the
# scraping loops.
pageTest = urllib.urlopen('https://www.koreus.com/modules/newbb/topic160787.html')
try:
    strpageTest = pageTest.read()
finally:
    # Release the connection even if the read fails.
    pageTest.close()
# Pagination markup that precedes the link to the last page.
splitText = '''<b>(1)</b> <a href="/modules/newbb/topic160787-20.html">2</a> <a href="/modules/newbb/topic160787-40.html">3</a> <a href="/modules/newbb/topic160787-60.html">4</a> ... <a '''
try:
    listSST = strpageTest.split(splitText)
    listEST = listSST[1].split("</a>")
    listNSST = listEST[0].split(">")
    # Label of the last pagination link, i.e. the number of pages.
    nbOfPages = int(listNSST[1])
except (IndexError, ValueError):
    # The expected pagination markup was not found. Fall back on the largest
    # post offset among the links to this topic.
    # NOTE(review): assumes page k is addressed by offset (k - 1) * 20, as in
    # the links listed in splitText -- confirm against the live site.
    offsets = [int(o) for o in re.findall(r'topic160787-(\d+)\.html', strpageTest)]
    nbOfPages = max(offsets) // 20 + 1 if offsets else 1
def createFPList(strpage):
    """Append the forum-post numbers found in ``strpage`` to ``listCom``.

    Every ``<a id="forumpostNNN">`` anchor of the page contributes the text
    ``NNN`` (a string), in page order, to the global list ``listCom``.
    """
    global listCom  # list of forum-post numbers, kept as strings
    fragments = strpage.split('<a id="f')
    listCom.extend(
        # "orumpost" is 8 characters long; what follows is the post number.
        fragment.split('">')[0][8:]
        for fragment in fragments
        if fragment.startswith('orumpost')
    )
def getDicoFromStr(fileStr):
    """Rebuild the author -> posts dictionary from the text of Sujet.html.

    The archive is written as a ``<meta>`` tag, then ``str(dico)``, then a
    newline and the length of the last page. The dictionary literal is
    located between the first ``{`` and the last ``}`` and evaluated with
    ``ast.literal_eval``. The previous hand-written splitting assumed that
    the first key was "Olrik" and lost track of the fields whenever a post
    contained an apostrophe, because ``repr`` then quotes the string with
    double quotes.

    Parameters:
        fileStr: full content of the archive file, as a string.

    Returns:
        The dictionary mapping each author to a flat list of strings (page
        offset, forum-post number, post text, ...). It is empty when the
        archive holds no dictionary or cannot be parsed. In the second case
        the message "Bug général" is also written to the ``texto`` widget.

    Side effects:
        Rebinds the globals ``dico``, ``auteurList`` (the authors) and
        ``commentList`` (their lists, in the same order).
    """
    global auteurList, commentList, dico, derTxt
    dico = {}
    auteurList = []
    commentList = []
    start = fileStr.find("{")
    end = fileStr.rfind("}")
    if start == -1 or end < start:
        # Empty or truncated archive: nothing to load.
        return dico
    try:
        parsed = ast.literal_eval(fileStr[start:end + 1])
        if not isinstance(parsed, dict):
            raise ValueError("the archive does not contain a dictionary")
    except (ValueError, SyntaxError):
        derTxt = "\n"+"Bug général"
        texto.insert(END, derTxt)
        return dico
    for auteur, posts in parsed.items():
        # An archive produced by the previous parser may hold a bare string;
        # always expose a list so that callers can extend it.
        if isinstance(posts, (list, tuple)):
            dico[auteur] = list(posts)
        else:
            dico[auteur] = [posts]
        auteurList.append(auteur)
        commentList.append(dico[auteur])
    return dico
def _fetch_page(suffixe):
    """Download one page of the topic and return its raw HTML.

    ``suffixe`` is the post offset used in the page URL (0, 20, 40, ...).
    Offset 0 is the first page, whose URL carries no suffix.
    """
    if suffixe == 0:
        url = 'https://www.koreus.com/modules/newbb/topic160787.html'
    else:
        url = 'https://www.koreus.com/modules/newbb/topic160787-'+str(suffixe)+'.html'
    page = urllib.urlopen(url)
    try:
        return page.read()
    finally:
        page.close()


def _strip_tags(texte, ouverture, decalage=0):
    """Remove every tag starting with ``ouverture`` from ``texte``.

    The closing ``>`` is searched from ``decalage`` characters after the
    start of the tag. An unterminated tag is left in place.
    """
    while ouverture in texte:
        debut = texte.index(ouverture)
        fin = texte.find(">", debut + decalage)
        if fin == -1:
            break
        texte = texte.replace(texte[debut:fin + 1], "")
    return texte


def _iter_posts(strpage):
    """Yield ``(author, text)`` for each post of a page, in page order."""
    for morceau in strpage.split('href="/memb'):
        if not morceau.startswith('re/'):
            continue
        auteur = morceau.split('">')[1].split('</a>')[0]
        corps = morceau.split('<div class="comText')[1]
        # Keep only the text of the post itself.
        texte = corps.split('</div>\r\n\t <br clear="all" />')[0][2:]
        texte = _strip_tags(texte, "<img")
        texte = _strip_tags(texte, "<iframe", 10)
        yield auteur, texte


def _fold(texte):
    """Return ``texte`` lower-cased, without accents when it is unicode."""
    try:
        return unicodedata.normalize('NFKD', texte).encode('ascii', 'ignore').lower()
    except TypeError:
        # Plain byte strings cannot be normalised; they are only lower-cased.
        return texte.lower()


def _save_archive(chemin_archive, contenu, lastLenPage):
    """Write the dictionary and the length of the last page to the archive."""
    with open(chemin_archive, "w") as fichier:
        fichier.write('<meta charset="UTF-8">')
        fichier.write(str(contenu))
        fichier.write("\n"+str(lastLenPage))


def rechercher():
    """Build or refresh the archive of the topic, then search it.

    1. Without an archive on disk, every page of the topic is downloaded and
       its posts are stored in ``dico`` (author -> flat list of page offset,
       forum-post number, post text, ...), which is then saved to
       ``Sujet.html`` in the ``chemin`` folder.
    2. With an archive, ``dico`` is loaded from it if still empty. When the
       length of the last page differs from the stored one, the pages from
       offset 1980 onwards are read again and the posts whose number is not
       yet known are added.
    3. The posts of the author typed in ``entreeAuteur`` ("*" for every
       author) are searched for the text typed in ``entreeText``, ignoring
       case and accents. Matches are appended to ``listResult`` as
       ``[page, number, text, author]`` and counted in ``result``.

    NOTE(review): this function is started in a worker thread by ``init2``
    but updates Tk widgets (``texto``, ``pCurrent``) directly; Tkinter calls
    are normally expected to be made from the main thread.
    """
    global listCom, entreeAuteur, listResult, dico, texto, derTxt, auteurSaved, accents, result, entree2
    lastPage = int(nbOfPages)*20  # offset of the page after the last one
    archive = os.path.join(chemin, "Sujet.html")

    if not os.path.isfile(archive):
        # FIRST USE (no archive on disk): read the whole topic.
        if not os.path.exists(chemin):
            os.mkdir(chemin)
        listCom = []
        countComm = -1
        lastLenPage = 0
        for faites, suffixe in enumerate(range(0, lastPage, 20)):
            strpage = _fetch_page(suffixe)
            lastLenPage = len(strpage)
            createFPList(strpage)
            for auteur, texte in _iter_posts(strpage):
                countComm += 1
                fiche = [str(suffixe), listCom[countComm], texte]
                dico.setdefault(auteur, []).extend(fiche)
            pCurrent['value'] = faites + 1
        # The archive is only created once complete, so that an interrupted
        # first run does not leave an empty, unreadable file behind.
        _save_archive(archive, dico, lastLenPage)
    else:
        # AN ARCHIVE ALREADY EXISTS
        with open(archive, "r") as fichier:
            fileStr = fichier.read()
        if len(dico) < 1:
            dico = getDicoFromStr(fileStr)
        # The last line of the archive is the length of the last page read.
        lastLenPage = int(fileStr.strip().rsplit("\n", 1)[-1])
        # UPDATE CHECK: only the current last page is compared.
        listCom = []
        news = len(_fetch_page(lastPage - 20)) != lastLenPage
        if not news:
            derTxt = "\n"+"BASE A JOUR!"
            texto.insert(END, derTxt)
        else:
            derTxt = "\n"+"MISE A JOUR DE LA BASE..."
            texto.insert(END, derTxt)
            countComm = -1
            # NOTE(review): 1980 is the hard-coded offset from which pages
            # are read again; presumably the last page when the script was
            # written -- confirm before changing it.
            for suffixe in range(1980, lastPage, 20):
                strpage = _fetch_page(suffixe)
                createFPList(strpage)
                lastLenPage = len(strpage)
                for auteur, texte in _iter_posts(strpage):
                    countComm += 1
                    numero = listCom[countComm]
                    if auteur in dico:
                        # Posts already archived are recognised by number.
                        if numero not in dico[auteur]:
                            dico[auteur].extend([str(suffixe), numero, texte])
                            derTxt = "\n"+"ECRITURE EN COURS"
                            texto.insert(END, derTxt)
                    else:
                        dico[auteur] = [str(suffixe), numero, texte]
            _save_archive(archive, dico, lastLenPage)

    # SEARCH THE EXCERPT AMONG THE POSTS OF THE AUTHOR
    try:
        auteur = entreeAuteur.get()  # unicode if accented, plain string otherwise
        auteur2 = _fold(auteur)
        # Read once, before the loops, so that entree2 is defined for
        # check_thread even when no author matches.
        entree2 = _fold(entreeText.get())
        tous = auteur == u"*"
        pCurrent['mode'] = "indeterminate"
        pCurrent.start(1)
        listResult = []
        result = 0
        page = num = None
        for key in dico:
            if not tous and auteur2 not in key.lower():
                continue
            auteurSaved = "Auteur inconnu" if tous else key
            for comm in dico[key]:
                try:
                    valeur = int(comm)
                except ValueError:
                    valeur = None
                if valeur is not None:
                    # Numeric entries precede the text they belong to.
                    # NOTE(review): values under 200000 are taken as page
                    # offsets and larger ones as forum-post numbers;
                    # presumably every post number of this topic is above
                    # 200000 -- verify.
                    if valeur < 200000:
                        page = comm
                    elif valeur > 200000:
                        num = comm
                    continue
                if page is None or num is None:
                    continue
                comm1 = comm
                for a in accents:
                    comm1 = comm1.replace(a[0], a[1])
                try:
                    comm2 = unicodedata.normalize('NFKD', comm1.decode('utf8').lower()).encode('ascii', 'ignore')
                except UnicodeError:
                    # A post that cannot be decoded is skipped.
                    continue
                if entree2 in comm2 and len(comm.split()) > 1:
                    derTxt = "\n"+"PAGE:"+str(int(page)//20)+" FORUMPOST :"+str(num)
                    texto.insert(END, derTxt)
                    result += 1
                    listResult.append([page, num, comm, key])
            if not tous:
                # A named search stops at the first matching author.
                return
        if tous:
            return
        derTxt = "\n"+"AUTEUR NON TROUVVE"
        texto.insert(END, derTxt)
        pCurrent.stop()
    except Exception:
        # Boundary of the worker thread: report the failure in the window
        # instead of dropping it silently.
        derTxt = "\n"+"Bug général"
        texto.insert(END, derTxt)
def check_thread():
    """Poll the search thread; once it is finished, build the result page.

    While ``thirdary_thread`` is alive the function re-schedules itself
    every 500 ms. Afterwards it shows the number of results in ``texto``,
    asks for confirmation above 100 results, writes ``pageWeb.html`` in the
    ``chemin`` folder (one table row per entry of ``listResult``) and opens
    that file.
    """
    if thirdary_thread.is_alive():
        master.after(500, check_thread)
        return
    derTxt = "\n"+"Nombre de resultats : "+str(result)
    texto.insert(END, derTxt)
    pCurrent.stop()
    if result > 100:
        if not tkMessageBox.askyesno("Continuer?", "Le script va ouvrir un nombre de page important, voulez vous continuer?"):
            return
    e = '''<meta charset="UTF-8">
<!DOCTYPE html>
<html>
<head>
<style>
table {
width:100%;
}
table, th, td {
border: 1px solid black;
border-collapse: collapse;
}
th, td {
padding: 15px;
text-align: left;
}
table#t01 tr:nth-child(even) {
background-color: #e0dfe7;
}
table#t01 tr:nth-child(odd) {
background-color: #fff;
}
</style>
</head><table id="t01" cellpadding="3px" cellspacing="0px" rules="all" style="border:solid 1px black; border-collapse:collapse; text-align:center;"> '''
    cible = os.path.join(chemin, "pageWeb.html")
    with open(cible, "w") as pageWeb:
        pageWeb.write(e)
        pageWeb.write('''<tr>
<th colspan="2" style="width:140px;background-color:#9a9ace"><FONT color="#fff">'''+auteurSaved+''' - Extrait du texte recherché : '''+entree2+''' - <U>Nombre de résultat(s)</U> : '''+str(result)+'''</FONT></th>
</tr>''')
        for page, numero, txt, auteur in listResult:
            for b in balises:
                txt = txt.replace(b, "")
            for a in accents:
                txt = txt.replace(a[0], a[1])
            # Keep at most 550 characters. If the cut falls inside a link
            # that is opened but not closed, drop that link from its "<a".
            txt2 = txt[:550]
            debut_lien = txt2.rfind('<a')
            fin_lien = txt2.rfind('/a>')
            if debut_lien > fin_lien:
                txt2 = txt2[:debut_lien]
            pageWeb.write('<tr><td>'+txt2+'<div align="right"><font face="verdana" color="orange" size="2">-<i><b>'+str(auteur)+'</b></i></font></div></td><td><a href="https://www.koreus.com/modules/newbb/topic160787-'+str(page)+'.html#forumpost'+str(numero)+'"> Page '+str(int(page)//20)+' / ForumPost.'+str(numero)+'</a></td></tr>')
    # Open the page that was just written, not a path tied to one user
    # account. os.startfile only exists on Windows.
    if hasattr(os, "startfile"):
        os.startfile(cible)
    else:
        webbrowser.open("file://" + os.path.abspath(cible))
def init2():
    """Check the two entry fields, then start the search in a thread.

    The output area is cleared first. An empty field or an excerpt shorter
    than 5 characters only displays a message. Otherwise ``rechercher`` is
    started in a new thread and ``check_thread`` is scheduled 50 ms later.
    """
    global thirdary_thread, derTxt, texto
    derTxt = ""
    texto.delete(1.0, END)
    pCurrent['mode'] = "determinate"
    auteur_saisi = entreeAuteur.get()
    extrait_saisi = entreeText.get()
    if not auteur_saisi or not extrait_saisi:
        derTxt = ("VEUILLEZ SAISIR UN NOM D'AUTEUR ET UN EXTRAIT DU TEXTE QU'IL AURAIT SAISI"
                  "\nTAPEZ * POUR REMPLACER LE NOM DE L'AUTEUR SI VOUS NE SAVEZ PAS")
        texto.insert(END, derTxt)
        return
    if len(extrait_saisi) < 5:
        derTxt = "\n"+"L'EXTRAIT DE TEXTE EST TROP COURT (MIN 5 CARACTERES)"
        texto.insert(END, derTxt)
        return
    thirdary_thread = threading.Thread(target=rechercher)
    thirdary_thread.start()
    master.after(50, check_thread)
# Widget layout. Each widget is created first and packed on a separate line:
# pack() returns None, so chaining it stored None in the variable and the
# frames were never used as parents. The frames fill the width of the window
# and share its background colour so that the layout is unchanged.
t1 = Frame(master, bg='#535353')
t1.pack(fill=X)
auteur_label = Label(t1, text="Auteur :", bg='#535353', fg="white")
auteur_label.pack(anchor="w")
entreeAuteur = Entry(t1, width=50)
entreeAuteur.pack(anchor="w")

t2 = Frame(master, bg='#535353')
t2.pack(fill=X)
text_label = Label(t2, text="Texte :", bg='#535353', fg="white")
text_label.pack(anchor="w")
entreeText = Entry(t2, width=50)
entreeText.pack(anchor="w")

t3 = Frame(master, bg='#535353')
t3.pack(fill=X)
Rechercher = Button(t3, text='Rechercher', command=init2)
Rechercher.pack(fill=BOTH, padx=20, pady=10)
# Progress bar: one step per page of the topic.
pCurrent = ttk.Progressbar(t3, orient='horizontal', mode='determinate', value=5, maximum=nbOfPages)
pCurrent.pack(fill=BOTH, pady=10)

# Output area with its vertical scrollbar.
scrollbar = Scrollbar(master)
scrollbar.pack(side=RIGHT, fill=Y)
texto = Text(master, wrap=WORD, yscrollcommand=scrollbar.set)
texto.pack()
scrollbar.config(command=texto.yview)

master.mainloop()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment