Skip to content

Instantly share code, notes, and snippets.

@LeNarvalo
Last active June 9, 2018 23:06
Show Gist options
  • Save LeNarvalo/b024ec6f4e23cc2500e180d2b3df0fc3 to your computer and use it in GitHub Desktop.
Save LeNarvalo/b024ec6f4e23cc2500e180d2b3df0fc3 to your computer and use it in GitHub Desktop.
# -*- coding: utf8 -*-
import urllib, webbrowser
import unicodedata
import threading
from Tkinter import *
import ttk
#import tkMessageBox
import os
import time
import re
#windowsAnswer = False
#def windowsAlert():
# global windowsAnswer
# windowsAnswer = tkMessageBox.askyesno("Continuer?", "Le script va ouvrir un nombre de page important, voulez vous continuer?")
# Root Tk window of the application, created when the module is loaded;
# fenetre() attaches its widgets to it and master.mainloop() runs it.
master=Tk()
# Dark grey window background (same colour the labels and checkbox use).
master.configure(background='#535353')
###VARIABLES###
# NOTE: a `global` statement at module level has no effect; the names below
# are module globals simply because they are assigned here.
global derTxt, texto, auteurSaved
# Author name shown in the header of the generated results page
# (set by research(), read by buildPageWeb()).
auteurSaved = ""
# Last status message inserted into the `texto` widget.
derTxt = ""
# The three display* flags and disableDisplay are not read anywhere in the
# visible source.
displaySignature = False
displayVideo = False
displayImage = False
# Search hits as [page, post id, comment text, author] lists (filled by research()).
listResult = []
# Forum-post ids collected by createFPList(), as strings.
listCom = []
# Archive: author name -> flat list of strings repeating
# page offset, forum-post id, comment text.
dico = {}
disableDisplay = True
# Directory that holds Sujet.html (the archive) and pageWeb.html (the results page).
chemin = os.path.expanduser('~/Veganisme')
# Markup removed from comment text before it is written to the results page.
balises=["<blockquote>",'<div class="xoopsQuote">',"<br />"]
# Pairs of (escaped UTF-8 byte sequence, replacement character) applied with
# str.replace() in research() and buildPageWeb(). Note that "\xc3\x87"
# (upper-case C cedilla) is mapped to the lower-case character.
accents=[["\xc3\xa7","ç"],["\xc3\x87","ç"],["\xe2\x80\x99","'"],["\xc3\xa9","é"],["\xc3\xa0","à"],["\xc3\xa8","è"],["\xc3\xb4","ô"],["\xc3\xb9","ù"],["\xc3\xaa","ê"],\
["\xc3\xae","î"],["\xc3\xaf","ï"]]
# Pairs of (backslash-escaped text as it appears in the archive file, real
# byte sequence); getDicoFromStr() uses them to undo the escaping.
accents2=[["\\xc3\\xa7","\xc3\xa7"],["\\xc3\\xae","\xc3\xae"],["\\xc3\\x87","\xc3\x87"],["\\xe2\\x80\\x99","\xe2\x80\x99"],["\\xc3\\xa9","\xc3\xa9"],["\\xc3\\xa0","\xc3\xa0"],\
["\\xc3\\xa8","\xc3\xa8"],["\\xc3\\xb4","\xc3\xb4"],["\\xc3\\xb9","\xc3\xb9"],["\\xc3\\xaa","\xc3\xaa"],["\\xc3\\xaf","\xc3\xaf"]]
# HTML/CSS preamble for a striped, bordered table with id "t01", stored as a
# one-element list. It is not referenced anywhere else in the visible source
# (buildPageWeb() writes its own preamble).
d = ['''<!DOCTYPE html>
<style>
table {
width:100%;
}
table, th, td {
border: 1px solid black;
border-collapse: collapse;
}
th, td {
padding: 15px;
text-align: left;
}
table#t01 tr:nth-child(even) {
background-color: #eee;
}
table#t01 tr:nth-child(odd) {
background-color: #fff;
}
</style>
<body>
<table id="t01">
''']
###############
# Runs when the module is loaded: download the first page of the topic and
# read a page count out of its pagination bar.
pageTest=urllib.urlopen('https://www.koreus.com/modules/newbb/topic160787.html')
strpageTest=pageTest.read()
# Exact pagination markup searched for: current page 1, links to pages 2-4,
# an ellipsis, then the start of one more anchor tag.
# NOTE(review): if the page does not contain this exact text, split() returns
# a single element and listSST[1] below raises IndexError.
splitText = '''<b>(1)</b> <a href="/modules/newbb/topic160787-20.html">2</a> <a href="/modules/newbb/topic160787-40.html">3</a> <a href="/modules/newbb/topic160787-60.html">4</a> ... <a '''
listSST = strpageTest.split(splitText)
# Keep what follows the marker up to the first closing anchor tag
# (presumably the link to the last page - confirm against the live markup).
listEST = listSST[1].split("</a>")
# Element 1 of the split on ">" is the text after the tag's attributes,
# which is converted to the page count.
listNSST = listEST[0].split(">")
nbOfPages = int(listNSST[1])
def createFPList(strpage):
    """Append the forum-post ids found in *strpage* to the global ``listCom``.

    Each ``<a id="forumpost...">`` anchor contributes the text between the
    ``forumpost`` prefix and the first ``">`` that follows it. Ids are
    appended as strings, in document order; nothing is returned.
    """
    global listCom
    prefix = 'orumpost'  # what remains of "forumpost" after splitting on '<a id="f'
    fragments = strpage.split('<a id="f')
    listCom.extend(
        fragment.split('">')[0][len(prefix):]
        for fragment in fragments
        if fragment.startswith(prefix)
    )
def getDicoFromStr(fileStr):
    """Rebuild the author -> comments dictionary from the text of Sujet.html.

    The archive is written by rechercher() as a ``<meta>`` tag, then
    ``str(dico)``, then a newline and a number. The dictionary literal is
    located between the first ``{`` and the last ``}`` and parsed with
    ``ast.literal_eval``. Unlike the previous hand-written splitter this
    does not assume which author comes first, and it keeps comment text
    intact when it contains ``', '`` or consists only of digits. It also
    restores every escaped byte sequence, not just a fixed list.

    Args:
        fileStr: full content of the archive file.

    Returns:
        dict mapping each author to a list of strings repeating
        page offset, forum-post id, comment text. Empty when the file holds
        no dictionary or the dictionary cannot be parsed (in the latter case
        "Bug général" is also written to the ``texto`` widget).

    Side effects:
        Rebinds the globals ``dico``, ``auteurList`` (author names) and
        ``commentList`` (the matching comment lists).
    """
    global auteurList, commentList, dico, derTxt
    import ast  # local import: nothing else in the file needs it

    dico = {}
    auteurList = []
    commentList = []

    debut = fileStr.find("{")
    fin = fileStr.rfind("}")
    if debut == -1 or fin < debut:
        # Empty or not-yet-written archive: nothing to load.
        return dico

    try:
        # literal_eval only accepts Python literals, so nothing in the file
        # is ever executed.
        contenu = ast.literal_eval(fileStr[debut:fin + 1])
    except (ValueError, SyntaxError):
        contenu = None
    if not isinstance(contenu, dict):
        derTxt = "\n"+"Bug général"
        texto.insert(END, derTxt)
        return dico

    for nom, commentaires in contenu.items():
        if isinstance(commentaires, (list, tuple)):
            dico[nom] = list(commentaires)
        else:
            # Archives produced by the old parser could hold a bare string.
            dico[nom] = [commentaires]

    auteurList = list(dico)
    commentList = [dico[nom] for nom in auteurList]
    return dico
def rechercher():
#SAVES THE TOPIC / UPDATES THE ARCHIVE / THEN SEARCHES THE TEXT EXCERPT BY AUTHOR
# Started by init2() in its own thread. Builds or refreshes the global `dico`
# (author -> repeated page offset, forum-post id, comment text), writes it to
# Sujet.html under `chemin`, then starts research() in a daemon thread.
# NOTE(review): this copy of the file has lost its indentation, so the
# nesting described in the comments below is inferred from statement order.
# NOTE(review): the path separator is a hard-coded backslash, which ties the
# file names to Windows - confirm.
global listCom, dico, texto, derTxt, research_thread
# Page offsets advance by 20; the loops below stop when the current offset
# equals this value.
lastPage=int(nbOfPages)*20 #LAST PAGE + (1*20)
#FIRST USE (NO ARCHIVE ON DISK)
if not os.path.isfile(chemin+'\\Sujet.html'):
if not os.path.exists(chemin):
os.mkdir(chemin)
# Create an empty archive file straight away.
# NOTE(review): if the download below fails, this empty file remains and the
# next run takes the "archive exists" branch - confirm that is acceptable.
file = open(chemin+"\\Sujet.html","w")
file.close()
listCom = []
page=urllib.urlopen('https://www.koreus.com/modules/newbb/topic160787.html')
strpage=page.read()
# Page offsets 0, 20, 40, ... used to build the page URLs.
suf = range(0,9999, 20)
countPage = 0
# Index into listCom of the comment being processed; incremented before use.
countComm = -1
# NOTE(review): the loop only ends when an offset equals lastPage exactly.
while suf[countPage]!=lastPage:
# Length of the page being processed; the last value is stored in the archive.
lastLenPage=len(strpage)
createFPList(strpage)
#DICT OF COMMENTS PER AUTHOR
# Each chunk starting with 're/' follows an 'href="/memb' link
# (presumably a member-profile link - confirm against the live markup).
b = strpage.split('href="/memb')
for c in b:
if c.startswith('re/'):
countComm += 1
# Author name: text between the first '">' and the following '</a>'.
e = c.split('">')
i = e[1].split('</a>')
auteur = i[0]
f = c.split('<div class="comText')
f1 = f[1]
h = f1.split('</div>\r\n\t <br clear="all" />') #h[0] = comment text only
# Skip the first two characters left over from the opening div tag.
h02 = h[0][2:]
##REMOVE IMG
# Delete every "<img ...>" tag: from "<img" up to the next ">".
while "<img" in h02:
idS = h02.index("<img")
idE = h02[idS:].index(">")
h02=h02.replace(h02[idS:idS+idE+1],"")
##REMOVE VID
# h3 is assigned but not read anywhere in the visible source.
h3 = ""
# Delete every "<iframe ...>" opening tag (the search for ">" starts 10
# characters after "<iframe").
while "<iframe" in h02 :
idS = h02.index("<iframe")
idE = h02[idS+10:].index(">")
h02=h02.replace(h02[idS:idS+idE+11],"")
# Append page offset, forum-post id and text to the author's list,
# creating the list on first sight of the author.
try:
dico[auteur].extend([str(suf[countPage]),listCom[countComm],h02])
except:
dico[auteur]=[str(suf[countPage]),listCom[countComm],h02]
# Move on to the next page and reflect progress in the progress bar.
countPage+=1
page=urllib.urlopen('https://www.koreus.com/modules/newbb/topic160787-'+str(suf[countPage])+'.html')
strpage=page.read()
pCurrent['value'] = countPage
#ARCHIVING
# File layout: meta tag, str(dico), newline, length of the last processed page.
file = open(chemin+"\\Sujet.html","w")
file.write('<meta charset="UTF-8">')
file.write(str(dico))
file.write("\n"+str(lastLenPage))
file.close()
#ARCHIVE ALREADY EXISTS
else:
# Load the archive only if nothing is in memory yet.
if len(dico) < 1:
file = open(chemin+"\\Sujet.html","r")
fileStr = file.read()
file.close()
dico = getDicoFromStr(fileStr)
#CHECK WHETHER AN UPDATE IS NEEDED
listCom = []
# NOTE(review): the start offset 1960 is hard-coded; presumably it was near
# the end of the topic when this was written - confirm.
suf = range(1960,9999, 20)
countPage = 0
page=urllib.urlopen('https://www.koreus.com/modules/newbb/topic160787-'+str(suf[countPage])+'.html')
strpage=page.read()
news = True
# The last line of the archive holds the page length saved by the previous run.
file = open(chemin+"\\Sujet.html","r")
fileList = file.readlines()
file.close()
lastLenPage = int(fileList[-1])
# `news` is overwritten on every pass, so only the last page compared decides
# its final value.
while suf[countPage]!=lastPage:
if len(strpage) == lastLenPage:
news = False
if len(strpage) != lastLenPage:
news = True
countPage+=1
page=urllib.urlopen('https://www.koreus.com/modules/newbb/topic160787-'+str(suf[countPage])+'.html')
strpage=page.read()
if not news:
derTxt = "\n"+"BASE A JOUR!"
texto.insert(END, derTxt)
if news:
derTxt = "\n"+"MISE A JOUR DE LA BASE..."
texto.insert(END, derTxt)
# NOTE(review): this fetch still uses the previous `suf`/`countPage`, i.e. the
# offset equal to lastPage; the loop below then records that page's comments
# under offset 1980 on its first pass - looks unintended, confirm.
page=urllib.urlopen('https://www.koreus.com/modules/newbb/topic160787-'+str(suf[countPage])+'.html')
strpage=page.read()
# NOTE(review): hard-coded start offset, see the note on 1960 above.
suf = range(1980,9999, 20)
countPage = 0
countComm = -1
while suf[countPage]!=lastPage:
createFPList(strpage)
lastLenPage=len(strpage)
#DICT OF COMMENTS PER AUTHOR
# Same extraction steps as in the first-use branch above.
b = strpage.split('href="/memb')
for c in b:
if c.startswith('re/'):
countComm += 1
e = c.split('">')
i = e[1].split('</a>')
auteur = i[0]
f = c.split('<div class="comText')
f1 = f[1]
h = f1.split('</div>\r\n\t <br clear="all" />') #h[0] = comment text only
h02 = h[0][2:]
##REMOVE IMG
while "<img" in h02:
idS = h02.index("<img")
idE = h02[idS:].index(">")
h02=h02.replace(h02[idS:idS+idE+1],"")
##REMOVE VID
h3 = ""
while "<iframe" in h02 :
idS = h02.index("<iframe")
idE = h02[idS+10:].index(">")
h02=h02.replace(h02[idS:idS+idE+11],"")
# Known author: add the comment only if its forum-post id is not stored yet.
if auteur in dico:
if listCom[countComm] not in dico[auteur]:
dico[auteur].extend([str(suf[countPage]),listCom[countComm],h02])
derTxt = "\n"+"ECRITURE EN COURS"
texto.insert(END, derTxt)
# New author: start a list for them.
else:
dico[auteur]=[str(suf[countPage]),listCom[countComm],h02]
countPage+=1
page=urllib.urlopen('https://www.koreus.com/modules/newbb/topic160787-'+str(suf[countPage])+'.html')
strpage=page.read()
# Rewrite the whole archive in the same layout as the first-use branch.
file = open(chemin+"\\Sujet.html","w")
file.write('<meta charset="UTF-8">')
file.write(str(dico))
file.write("\n"+str(lastLenPage))
file.close()
# Run the search itself in a daemon thread.
research_thread = threading.Thread(target=research)
research_thread.daemon = True
research_thread.start()
def research():
# Scan the global `dico` for comments containing the search text `entree2`
# written by the author matching `auteur2` (or by any author when `auteur`
# is "*"). Hits are collected in `listResult` and counted in `result`, then
# buildPageWeb() is started in a daemon thread.
# NOTE(review): this copy of the file has lost its indentation, so the
# nesting described in the comments below is inferred from statement order.
# NOTE(review): widgets (texto, pCurrent) are updated from this worker
# thread; confirm that is safe with the Tk build in use.
global listResult, dico, texto, derTxt, auteurSaved, result, entree2, auteur2
#SEARCH THE EXCERPT BY AUTHOR
# Everything is wrapped in a try whose handler does nothing (the final
# `except: None`), so any error in here is dropped silently.
try:
derTxt = "\n"+"RECHERCHE EN COURS..."
texto.insert(END, derTxt)
# Switch the progress bar to an animated "busy" display.
pCurrent['mode'] = "indeterminate"
pCurrent.start(1)
listResult = []
result = 0
for key in dico.keys():
# Author filter: substring match on the lower-cased key, or "*" for everyone.
if auteur2 in key.lower() or auteur==u"*":
if auteur != u"*":
auteurSaved = key
else:
auteurSaved = "Auteur inconnu"
# The author's list mixes page offsets, forum-post ids and comment texts.
for comm in dico[key]:
# A numeric entry below 200000 is remembered as the current page offset...
try:
if int(comm)<200000:
page = comm
except:
None
# ...and one above 200000 as the current forum-post id. Non-numeric entries
# make int() fail; the bare `None` in the handler is a no-op.
try:
if int(comm)>200000:
num = comm
except:
None
try:
comm1 = comm
for a in accents:
comm1 = comm1.replace(a[0],a[1])
# Unless "Casse" is ticked, compare on a lower-cased, accent-stripped ASCII copy.
if not casse.get():
try:
comm2 = unicodedata.normalize('NFKD', comm1.decode('utf8').lower()).encode('ascii', 'ignore')
except:
comm2 = comm1.lower() #Normally unnecessary
else:
comm2 = comm1
# A hit needs the search text and at least two whitespace-separated words.
if entree2.encode('utf8') in comm2 and len(comm.split())>1 :
# Only the first 200 hits are listed in the text widget.
if result < 200:
derTxt = "\n"+"PAGE:"+str(int(page)/20)+" FORUMPOST :"+str(num)
texto.insert(END, derTxt)
result += 1
listResult.append([page,num,comm,key])
#if result % 80 == 0:
# derTxt = "\n"+"PAUSE REQUIERED"
# texto.insert(END, derTxt)
# time.sleep(1)
except:
None
# Stop scanning authors once a key matched the requested author.
if auteur2 in key.lower():
break
# Write and open the results page in a daemon thread.
buildPageWeb_thread = threading.Thread(target=buildPageWeb)
buildPageWeb_thread.daemon = True
buildPageWeb_thread.start()
if auteur==u"*":
return
# NOTE(review): with the nesting lost it is unclear which paths reach the
# "not found" message below; as the statements read, it follows every search
# for a named author - confirm against the original file.
derTxt = "\n"+"AUTEUR NON TROUVVE"
texto.insert(END, derTxt)
pCurrent.stop()
except:
None
def _truncate_excerpt(txt, limit=550):
    """Return at most *limit* characters of *txt* without ending inside a link.

    When the last ``<a`` of the excerpt has no ``/a>`` after it, the excerpt
    is cut just before that opening tag so that no half-written anchor (not
    even its leading ``<``) reaches the page.
    """
    excerpt = txt[:limit]
    last_open = excerpt.rfind('<a')
    last_close = excerpt.rfind('/a>')
    if last_open > last_close:
        return excerpt[:last_open]
    return excerpt


def buildPageWeb(afficherPlusDe200=False):
    """Write the search hits to pageWeb.html and open it.

    Reads the globals ``result``, ``listResult``, ``auteurSaved``,
    ``entree2``, ``casse`` and ``chemin``.

    Args:
        afficherPlusDe200: when False and there are 200 hits or more, no page
            is written; the text widget is cleared, a ">200" notice is shown
            and the "Afficher" button is enabled instead. When True the page
            is written whatever the number of hits.

    Changes from the previous version:
        * the search term is encoded into a local variable instead of
          rebinding the global ``entree2``, so calling this function a
          second time no longer re-encodes an already encoded value;
        * the excerpt is cut before an unfinished ``<a`` tag without leaving
          a stray ``<`` and without assuming the text is 550 characters long;
        * the output file is closed even if writing fails.
    """
    global texto, derTxt
    #if research_thread.isAlive():
    # time.sleep(2)
    # buildPageWeb()
    pCurrent.stop()
    if result >= 200 and not afficherPlusDe200:
        derTxt=""
        texto.delete(1.0, END)
        derTxt = "\n"+"Nombre de resultats : >200"
        texto.insert(END, derTxt)
        Afficher['state']='normal'
        return
    if result < 200:
        derTxt = "\n"+"Nombre de resultats : "+str(result)
        texto.insert(END, derTxt)
    e = '''<meta charset="UTF-8">
<!DOCTYPE html>
<html>
<head>
<style>
table {
width:100%;
}
table, th, td {
border: 1px solid black;
border-collapse: collapse;
}
th, td {
padding: 15px;
text-align: left;
}
table#t01 tr:nth-child(even) {
background-color: #e0dfe7;
}
table#t01 tr:nth-child(odd) {
background-color: #fff;
}
</style>
</head><table id="t01" cellpadding="3px" cellspacing="0px" rules="all" style="border:solid 1px black; border-collapse:collapse; text-align:center;"> '''
    # With "Casse" ticked the term may still be a unicode object; encode it
    # once, locally, so it can be concatenated with the byte strings below.
    search_term = entree2
    if casse.get() and not isinstance(search_term, bytes):
        search_term = search_term.encode('utf8')
    with open(chemin+"\\pageWeb.html","w") as pageWeb:
        pageWeb.write(e)
        pageWeb.write('''<tr>
<th colspan="2" style="width:140px;background-color:#9a9ace"><FONT color="#fff">'''+auteurSaved+''' - Extrait du texte recherché : '''+search_term+''' - <U>Nombre de résultat(s)</U> : '''+str(result)+'''</FONT></th>
</tr>''')
        for page, num, comm, author in listResult:
            txt = comm
            for b in balises:
                txt = txt.replace(b,"")
            for a in accents:
                txt = txt.replace(a[0],a[1])
            txt2 = _truncate_excerpt(txt)
            # Page offsets advance by 20, so offset // 20 is the page index.
            pageWeb.write('<tr><td>'+txt2+'<div align="right"><font face="verdana" color="orange" size="2">-<i><b>'+str(author)+'</b></i></font></div></td><td><a href="https://www.koreus.com/modules/newbb/topic160787-'+str(page)+'.html#forumpost'+str(num)+'"> Page '+str(int(page)//20)+' / ForumPost.'+str(num)+'</a></td></tr>')
    # Open the page from a daemon thread.
    displayPageWeb = threading.Thread(target=displaySearch)
    displayPageWeb.daemon = True
    displayPageWeb.start()
def displaySearch():
    """Open the generated results page and disable the "Afficher" button.

    The page is looked up under ``chemin``, the same location
    buildPageWeb() writes to, instead of a path hard-coded to one user's
    home directory. ``os.startfile`` is only available on Windows.
    """
    # normpath turns the mixed separators into a proper Win32 path.
    os.startfile(os.path.normpath(chemin + "\\pageWeb.html"))
    Afficher['state']='disabled'
def _fold_ascii_lower(text):
    """Return *text* lower-cased, with accents stripped when it is unicode."""
    try:
        return unicodedata.normalize('NFKD', text).encode('ascii', 'ignore').lower()
    except TypeError:
        # Python 2: Tk hands back a plain byte string for pure-ASCII input,
        # which normalize() refuses; such text has no accents to strip.
        return text.lower()


def init2():
    """Validate the two entry fields and start the search thread.

    Callback of the "Rechercher" button. Clears the text widget, resets the
    progress bar and the "Afficher" button, then checks that both fields are
    filled and that the text excerpt has at least 5 characters. On success it
    sets the globals ``auteur`` (raw author input), ``auteur2`` (folded
    author) and ``entree2`` (search text, folded unless "Casse" is ticked)
    and runs rechercher() in a new thread.
    """
    global search_thread, derTxt, texto, entree2, auteur2, auteur
    derTxt=""
    texto.delete(1.0, END)
    pCurrent['mode'] = "determinate"
    Afficher['state']='disabled'

    # Read each field once so every check sees the same value.
    typed_author = entreeAuteur.get()  #Unicode if accented, str otherwise
    typed_text = entreeText.get()  #Unicode if accented, str otherwise

    if len(typed_author)==0 or len(typed_text)==0:
        derTxt = "VEUILLEZ SAISIR UN NOM D'AUTEUR ET UN EXTRAIT DU TEXTE QU'IL AURAIT SAISI"\
        +"\nTAPEZ * POUR REMPLACER LE NOM DE L'AUTEUR SI VOUS NE SAVEZ PAS"
        texto.insert(END, derTxt)
        return
    if len(typed_text)<5:
        derTxt = "\n"+"L'EXTRAIT DE TEXTE EST TROP COURT (MIN 5 CARACTERES)"
        texto.insert(END, derTxt)
        return

    auteur = typed_author
    auteur2 = _fold_ascii_lower(auteur)
    if casse.get():
        # Case-sensitive search: keep the text exactly as typed.
        entree2 = typed_text
    else:
        entree2 = _fold_ascii_lower(typed_text)

    search_thread = threading.Thread(target=rechercher)
    search_thread.start()
def afficherPlusDe200():
    """Callback of the "Afficher" button: build the page without the 200-hit cap."""
    buildPageWeb(afficherPlusDe200=True)
def fenetre():
    """Create every widget of the main window and lay it out on a grid.

    Publishes the widgets other functions use as globals: ``entreeAuteur``,
    ``entreeText``, ``casse``, ``Rechercher``, ``Afficher``, ``pCurrent``,
    ``scrollbar`` and ``texto``.

    The previous version stored ``Frame(master).grid(...)`` in ``t1``..``t5``
    and passed those as parents. ``grid()`` returns ``None``, so the widgets
    were in fact attached to the default root window. The parent is now
    spelled out as ``master``. The empty frames are still created, in the
    same order and with the same grid options, so the layout is unchanged
    (the one on row 2 carries the 20-pixel vertical padding).
    """
    global Afficher, texto, pCurrent, entreeAuteur, entreeText, Rechercher, scrollbar, casse

    # Row 0: author field.
    Frame(master).grid(row=0)
    auteur_label = Label(master, text="Auteur :",bg='#535353',fg="white")
    auteur_label.grid(row=0,column=0, sticky=W)
    entreeAuteur = Entry(master, width=50)
    entreeAuteur.grid(row=0,column=1, columnspan=3, sticky=W+E)

    # Row 1: text-excerpt field.
    Frame(master).grid(row=1)
    text_label = Label(master, text="Texte :",bg='#535353',fg="white")
    text_label.grid(row=1,column=0, sticky=W)
    entreeText = Entry(master, width=50)
    entreeText.grid(row=1,column=1, columnspan=3, sticky=W+E)

    # Row 2: case-sensitivity checkbox and the two buttons.
    Frame(master).grid(row=2,pady=20)
    casse = IntVar()
    bouton=Checkbutton(master, text="Casse", variable=casse,bg='#535353',activebackground='#535353',fg="#FFFFFF",selectcolor="black")
    bouton.grid(row=2)
    Rechercher = Button(master, text ='Rechercher', command=init2,width=20)
    Rechercher.grid(row=2,column=1, sticky=E)
    Afficher = Button(master,width=20, compound=LEFT, overrelief=GROOVE, text ='Afficher > 200 résultats', fg="red", command=afficherPlusDe200, state=DISABLED)
    Afficher.grid(row=2,column=2, sticky=W)

    # Row 3: progress bar, scaled to the number of pages of the topic.
    Frame(master).grid(row=3)
    pCurrent = ttk.Progressbar(master, orient='horizontal', mode='determinate', value=5, maximum=nbOfPages)
    pCurrent.grid(row=3, column=0,columnspan=4, sticky=W+E)

    # Row 4: status/output text area with its scrollbar.
    Frame(master).grid(row=4)
    scrollbar = Scrollbar(master)
    scrollbar.grid(row=4, column=3,sticky=W+S+N)
    texto = Text(master, wrap=WORD, yscrollcommand=scrollbar.set, width=50)
    texto.grid(row=4,column=0, columnspan=3, sticky=W+E)
    scrollbar.config(command=texto.yview)
# Build the widgets, then hand control to the Tk event loop.
fenetre()
master.mainloop()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment