Skip to content

Instantly share code, notes, and snippets.

@LeNarvalo
Created May 29, 2018 00:09
Show Gist options
  • Save LeNarvalo/6bc6cb673182fdab8a891f51cd617b2b to your computer and use it in GitHub Desktop.
Save LeNarvalo/6bc6cb673182fdab8a891f51cd617b2b to your computer and use it in GitHub Desktop.
# -*- coding: utf8 -*-
import urllib, webbrowser
import unicodedata
import threading
from Tkinter import *
import ttk
import tkMessageBox
import os
import time
import re
# Root window of the application (dark grey background).
master = Tk()
master.configure(background='#535353')

### VARIABLES ###
# Shared module-level state read and written by the worker thread and the
# GUI callbacks.  (A `global` statement is not needed at module scope.)
auteurSaved = ""   # author name shown in the title row of the result page
derTxt = ""        # last message inserted into the log text widget

# Display switches.
displaySignature = False
displayVideo = False
displayImage = False
disableDisplay = True

listResult = []    # search hits
listCom = []       # forum-post ids (strings), in scraping order
dico = {}          # author -> flat list of archived entries

# Directory holding the archive and the generated result page.
chemin = os.path.expanduser('~/Veganisme')

# HTML fragments removed from a comment before it is displayed.
balises = [
    "<blockquote>",
    '<div class="xoopsQuote">',
    "<br />",
]

# [encoded form, replacement] pairs: first the escaped textual form
# (literal backslashes), then the raw byte sequences.
accents = [
    ["\\xc3\\xa7", "ç"],
    ["\\xc3\\xae", "î"],
    ["\\xc3\\x87", "ç"],
    ["\\xe2\\x80\\x99", "'"],
    ["\\xc3\\xa9", "é"],
    ["\\xc3\\xa0", "à"],
    ["\\xc3\\xa8", "è"],
    ["\\xc3\\xb4", "ô"],
    ["\\xc3\\xb9", "ù"],
    ["\\xc3\\xaa", "ê"],
    ["\xc3\xa7", "ç"],
    ["\xc3\x87", "ç"],
    ["\xe2\x80\x99", "'"],
    ["\xc3\xa9", "é"],
    ["\xc3\xa0", "à"],
    ["\xc3\xa8", "è"],
    ["\xc3\xb4", "ô"],
    ["\xc3\xb9", "ù"],
    ["\xc3\xaa", "ê"],
    ["\xc3\xae", "î"],
]

# HTML/CSS prologue of a striped table, kept as a one-element list.
d = ['''<!DOCTYPE html>
<style>
table {
width:100%;
}
table, th, td {
border: 1px solid black;
border-collapse: collapse;
}
th, td {
padding: 15px;
text-align: left;
}
table#t01 tr:nth-child(even) {
background-color: #eee;
}
table#t01 tr:nth-child(odd) {
background-color: #fff;
}
</style>
<body>
<table id="t01">
''']
###############
# Pagination snippet expected on the first page of the topic; the link that
# follows it is the link to the last page.
splitText = '''<b>(1)</b> <a href="/modules/newbb/topic160787-20.html">2</a> <a href="/modules/newbb/topic160787-40.html">3</a> <a href="/modules/newbb/topic160787-60.html">4</a> ... <a '''


def _count_topic_pages(html):
    """Return the number of pages of the topic, read from its first page.

    The label of the link that follows ``splitText`` is tried first.  When
    that snippet is missing or its label is not a number, the count is
    derived from the highest ``topic160787-<offset>.html`` link found in the
    page (offsets advance by 20 per page).  A page with no such link counts
    as a single-page topic.
    """
    pieces = html.split(splitText)
    if len(pieces) > 1:
        label = pieces[1].split("</a>")[0].split(">")
        try:
            return int(label[1])
        except (IndexError, ValueError):
            pass  # unexpected markup: fall through to the link scan
    offsets = [int(value) for value in re.findall(r'topic160787-(\d+)\.html', html)]
    if offsets:
        return max(offsets) // 20 + 1
    return 1


# Probe the first page once at start-up to learn how many pages exist.
pageTest = urllib.urlopen('https://www.koreus.com/modules/newbb/topic160787.html')
try:
    strpageTest = pageTest.read()
finally:
    pageTest.close()
nbOfPages = _count_topic_pages(strpageTest)
def createFPList(strpage):
    """Collect the forum-post ids found in one topic page.

    Every ``<a id="forumpostNNN"`` anchor of ``strpage`` contributes ``NNN``
    (as a string, in page order) to the module-level ``listCom`` list.

    Args:
        strpage: HTML source of a topic page.

    Returns:
        The ids found in this page, in order (the same values that were
        appended to ``listCom``).
    """
    global listCom  # forum-post ids, kept as strings
    # Stop at the closing quote of the id attribute so that extra attributes
    # on the anchor can never leak into the id.
    found = re.findall(r'<a id="forumpost([^"]*)"', strpage)
    listCom.extend(found)
    return found
def getDicoFromStr(fileStr):
    """Rebuild the author dictionary from the text of the archive file.

    The archive is written as a ``<meta>`` tag, then ``str(dico)``, then a
    newline and a page length.  The dictionary literal (from the first ``{``
    to the last ``}``) is parsed with ``ast.literal_eval``, which only
    accepts Python literals and never executes code.

    Args:
        fileStr: full content of the archive file, as a string.

    Returns:
        A dict mapping each author to a list of archived entries.  When the
        content cannot be parsed, "Bug général" is appended to the log
        widget and an empty dict is returned.

    Side effects:
        Rebinds the module-level ``dico``, ``auteurList`` (authors) and
        ``commentList`` (one entry list per author, same order).
    """
    global auteurList, commentList, dico, derTxt
    import ast  # local import: the file header does not import it

    dico = {}
    auteurList = []
    commentList = []

    parsed = None
    start = fileStr.find("{")
    end = fileStr.rfind("}")
    if start != -1 and end > start:
        try:
            parsed = ast.literal_eval(fileStr[start:end + 1])
        except (ValueError, SyntaxError):
            parsed = None

    if not isinstance(parsed, dict):
        derTxt = "\n"+"Bug général"
        texto.insert(END, derTxt)
        return dico

    for auteur, entries in parsed.items():
        # Always store a list so later .append/.extend calls keep working.
        if isinstance(entries, (list, tuple)):
            entries = list(entries)
        else:
            entries = [entries]
        dico[auteur] = entries
        auteurList.append(auteur)
        commentList.append(entries)
    return dico
def _topic_url(offset):
    """Return the URL of the topic page whose first post has index `offset`."""
    base = 'https://www.koreus.com/modules/newbb/topic160787'
    if offset == 0:
        return base + '.html'
    return base + '-' + str(offset) + '.html'


def _fetch_page(offset):
    """Download one topic page and return its HTML, closing the connection."""
    page = urllib.urlopen(_topic_url(offset))
    try:
        return page.read()
    finally:
        page.close()


def _strip_media(html):
    """Remove every ``<img ...>`` and ``<iframe ...>`` opening tag from `html`."""
    return re.sub(r'<(?:img|iframe)[^>]*>', '', html)


def _fold_text(text):
    """Return `text` lower-cased, with accents removed when it is unicode."""
    try:
        return unicodedata.normalize('NFKD', text).encode('ascii', 'ignore').lower()
    except TypeError:
        # Not a unicode object, so there is nothing to normalise.
        return text.lower()


def _write_archive(archive_path, lastLenPage):
    """Write the author dictionary, then the length of the last scraped page."""
    with open(archive_path, "w") as archive:
        archive.write('<meta charset="UTF-8">')
        archive.write(str(dico))
        archive.write("\n"+str(lastLenPage))


def _collect_posts(strpage, offset, only_new):
    """Store the posts of one page in ``dico``.

    Each stored post adds ``[offset, forum-post id, comment html]`` to the
    flat list of its author.  With ``only_new`` set, posts whose id is
    already listed for the author are skipped and a log line is written for
    every post added to a known author.
    """
    global derTxt
    first_id = len(listCom)
    createFPList(strpage)  # appends this page's forum-post ids to listCom
    page_ids = listCom[first_id:]

    # A post is a member link followed by a comment block; a member link
    # without a comment block is not a post header and is ignored.
    posts = [chunk for chunk in strpage.split('href="/memb')
             if chunk.startswith('re/') and '<div class="comText' in chunk]

    # Ids and posts are paired in page order; zip stops at the shorter list.
    for forum_id, chunk in zip(page_ids, posts):
        auteur = chunk.split('">')[1].split('</a>')[0]
        comment = chunk.split('<div class="comText')[1]
        # Keep only the comment text, minus the 2 characters left over from
        # the opening tag.
        comment = comment.split('</div>\r\n\t <br clear="all" />')[0][2:]
        record = [str(offset), forum_id, _strip_media(comment)]
        if auteur not in dico:
            dico[auteur] = record
        elif not only_new:
            dico[auteur].extend(record)
        elif forum_id not in dico[auteur]:
            dico[auteur].extend(record)
            derTxt = "\n"+"ECRITURE EN COURS"
            texto.insert(END, derTxt)


def _refresh_archive(lastPage):
    """Create the on-disk archive, or bring an existing one up to date.

    `lastPage` is the post offset just past the last page (pages * 20).
    """
    global listCom, dico, derTxt
    archive_path = os.path.join(chemin, 'Sujet.html')
    listCom = []

    # First use: no archive yet.  An empty file (left behind by an aborted
    # first run) is treated the same way.
    if not (os.path.isfile(archive_path) and os.path.getsize(archive_path) > 0):
        if not os.path.exists(chemin):
            os.mkdir(chemin)
        lastLenPage = 0
        for countPage, offset in enumerate(range(0, lastPage, 20)):
            strpage = _fetch_page(offset)
            lastLenPage = len(strpage)
            _collect_posts(strpage, offset, False)
            pCurrent['value'] = countPage + 1
        # The archive is written only once the scrape has completed.
        _write_archive(archive_path, lastLenPage)
        return

    # Archive already present.
    with open(archive_path, "r") as archive:
        fileStr = archive.read()
    if len(dico) < 1:
        dico = getDicoFromStr(fileStr)
    try:
        lastLenPage = int(fileStr.rsplit("\n", 1)[-1])
    except ValueError:
        lastLenPage = -1  # unreadable length: force an update pass

    # The archive counts as current when the last page still has the length
    # recorded when the archive was written.
    newest = max(lastPage - 20, 0)
    if len(_fetch_page(newest)) == lastLenPage:
        derTxt = "\n"+"BASE A JOUR!"
        texto.insert(END, derTxt)
        return

    derTxt = "\n"+"MISE A JOUR DE LA BASE..."
    texto.insert(END, derTxt)
    # 1980 is the resume offset hard-coded by the original script; it is
    # clamped so that the last page is always re-read.
    for offset in range(min(1980, newest), lastPage, 20):
        strpage = _fetch_page(offset)
        lastLenPage = len(strpage)
        _collect_posts(strpage, offset, True)
    _write_archive(archive_path, lastLenPage)


def _search_archive():
    """Log and record every archived comment matching the author and text."""
    global listResult, derTxt, auteurSaved, result
    auteur = entreeAuteur.get()  # unicode when it holds accents, str otherwise
    auteur2 = _fold_text(auteur)
    wildcard = auteur == u"*"
    pCurrent['mode'] = "indeterminate"
    pCurrent.start(1)
    page = num = None
    for key in dico.keys():
        same_author = auteur2 in key.lower()
        if not (same_author or wildcard):
            continue
        auteurSaved = "Auteur inconnu" if wildcard else key
        for comm in dico[key]:
            # Numeric entries are bookkeeping: values below 200000 are page
            # offsets, values above are forum-post ids.
            try:
                number = int(comm)
            except ValueError:
                number = None
            if number is not None:
                if number < 200000:
                    page = comm
                elif number > 200000:
                    num = comm
                continue
            if page is None or num is None or len(comm.split()) < 2:
                continue
            comm1 = comm
            for a in accents:
                comm1 = comm1.replace(a[0], a[1])
            try:
                comm2 = unicodedata.normalize('NFKD', comm1.decode('utf8').lower()).encode('ascii', 'ignore')
                found = entree2 in comm2
            except UnicodeError:
                continue  # undecodable comment: skip it
            if found:
                derTxt = "\n"+"PAGE:"+str(int(page)//20)+" FORUMPOST :"+str(num)
                texto.insert(END, derTxt)
                result += 1
                listResult.append([page, num, comm, key])
        if same_author:
            return  # only the first matching author is searched
    if wildcard:
        return
    derTxt = "\n"+"AUTEUR NON TROUVVE"
    texto.insert(END, derTxt)
    pCurrent.stop()


def rechercher():
    """Archive or update the topic, then search it by author and excerpt.

    Reads the author and text entry widgets, fills ``listResult`` with
    ``[page, forum-post id, comment, author]`` hits, counts them in
    ``result`` and keeps the folded search text in ``entree2``.  Those three
    globals are set before any network or disk access so that they exist
    even when the run fails.  Any error is reported in the log widget.

    NOTE(review): this is started on a worker thread by ``init2`` yet it
    touches Tk widgets directly; confirm this is safe on the target Tk
    build, or marshal widget updates through ``master.after``.
    """
    global listCom, entreeAuteur, listResult, dico, texto, derTxt, auteurSaved, accents, result, entree2
    listResult = []
    result = 0
    entree2 = _fold_text(entreeText.get())
    try:
        _refresh_archive(int(nbOfPages)*20)
        _search_archive()
    except Exception as exc:
        # Report instead of failing silently in the background thread.
        derTxt = "\n"+"ERREUR : "+str(exc)
        texto.insert(END, derTxt)
# Prologue of the generated result page (styles and opening table tag).
_RESULT_PAGE_HEAD = '''<meta charset="UTF-8">
<!DOCTYPE html>
<html>
<head>
<style>
table {
width:100%;
}
table, th, td {
border: 1px solid black;
border-collapse: collapse;
}
th, td {
padding: 15px;
text-align: left;
}
table#t01 tr:nth-child(even) {
background-color: #e0dfe7;
}
table#t01 tr:nth-child(odd) {
background-color: #fff;
}
</style>
</head><table id="t01" cellpadding="3px" cellspacing="0px" rules="all" style="border:solid 1px black; border-collapse:collapse; text-align:center;"> '''


def _excerpt(txt, limit=550):
    """Return the first `limit` characters of `txt`.

    When that prefix ends inside or after an ``<a`` tag that has no closing
    ``</a>`` yet, the prefix is cut just before the tag so the link cannot
    swallow the rest of the table row.
    """
    head = txt[:limit]
    opened = head.rfind('<a')
    closed = head.rfind('</a>')
    if opened > closed:
        return head[:opened]
    return head


def check_thread():
    """Poll the search thread; once it has ended, write and open the results.

    While ``thirdary_thread`` is alive the function reschedules itself every
    500 ms.  Afterwards it logs the number of results, asks for confirmation
    above 100 results, writes ``pageWeb.html`` into ``chemin`` (one table row
    per entry of ``listResult``) and opens that file.
    """
    if thirdary_thread.is_alive():
        master.after(500, check_thread)
        return

    # The worker may have ended before publishing these globals.
    found = globals().get("result", 0)
    searched = globals().get("entree2", "")

    derTxt = "\n"+"Nombre de resultats : "+str(found)
    texto.insert(END, derTxt)
    pCurrent.stop()
    if found > 100:
        if not tkMessageBox.askyesno("Continuer?", "Le script va ouvrir un nombre de page important, voulez vous continuer?"):
            return

    page_path = os.path.join(chemin, "pageWeb.html")
    with open(page_path, "w") as pageWeb:
        pageWeb.write(_RESULT_PAGE_HEAD)
        pageWeb.write('''<tr>
<th colspan="2" style="width:140px;background-color:#9a9ace"><FONT color="#fff">'''+auteurSaved+''' - Extrait du texte recherché : '''+searched+''' - <U>Nombre de résultat(s)</U> : '''+str(found)+'''</FONT></th>
</tr>''')
        for hit in listResult:
            page, num, txt, author = hit[0], hit[1], hit[2], hit[3]
            for b in balises:
                txt = txt.replace(b, "")
            for a in accents:
                txt = txt.replace(a[0], a[1])
            txt2 = _excerpt(txt)
            pageWeb.write('<tr><td>'+txt2+'<div align="right"><font face="verdana" color="orange" size="2">-<i><b>'+str(author)+'</b></i></font></div></td><td><a href="https://www.koreus.com/modules/newbb/topic160787-'+str(page)+'.html#forumpost'+str(num)+'"> Page '+str(int(page)//20)+' / ForumPost.'+str(num)+'</a></td></tr>')
        # Close the elements opened by the prologue.
        pageWeb.write('</table></html>')

    # Open the file that was just written, not a fixed per-user path.
    if hasattr(os, "startfile"):
        os.startfile(page_path)
    else:
        webbrowser.open("file://" + os.path.abspath(page_path))
def init2():
    """Validate the two entry fields and start a search in the background.

    Bound to the "Rechercher" button.  Nothing is started while a previous
    search thread is still alive, so two workers can never modify the shared
    globals and the archive file at the same time.  On valid input the
    ``rechercher`` worker is started and ``check_thread`` is scheduled to
    poll it.
    """
    global thirdary_thread, derTxt, texto

    # The global does not exist before the first search has been started.
    running = globals().get("thirdary_thread")
    if running is not None and running.is_alive():
        derTxt = "\n"+"RECHERCHE DEJA EN COURS, VEUILLEZ PATIENTER"
        texto.insert(END, derTxt)
        return

    derTxt = ""
    texto.delete(1.0, END)
    pCurrent['mode'] = "determinate"

    author_text = entreeAuteur.get()
    search_text = entreeText.get()
    if len(author_text) == 0 or len(search_text) == 0:
        derTxt = "VEUILLEZ SAISIR UN NOM D'AUTEUR ET UN EXTRAIT DU TEXTE QU'IL AURAIT SAISI"\
        +"\nTAPEZ * POUR REMPLACER LE NOM DE L'AUTEUR SI VOUS NE SAVEZ PAS"
        texto.insert(END, derTxt)
        return
    if len(search_text) < 5:
        derTxt = "\n"+"L'EXTRAIT DE TEXTE EST TROP COURT (MIN 5 CARACTERES)"
        texto.insert(END, derTxt)
        return

    thirdary_thread = threading.Thread(target=rechercher)
    thirdary_thread.start()
    master.after(50, check_thread)
# GUI layout.  pack() returns None, so each widget is created first and
# packed in a separate statement; the variables then hold the widget, and the
# frames can really be used as parents.  The frames fill the window width and
# share its background so the rows stay left-aligned on a uniform colour.

# Author row.
t1 = Frame(master, bg='#535353')
t1.pack(fill=X)
auteur_label = Label(t1, text="Auteur :", bg='#535353', fg="white")
auteur_label.pack(anchor="w")
entreeAuteur = Entry(t1, width=50)
entreeAuteur.pack(anchor="w")

# Search-text row.
t2 = Frame(master, bg='#535353')
t2.pack(fill=X)
text_label = Label(t2, text="Texte :", bg='#535353', fg="white")
text_label.pack(anchor="w")
entreeText = Entry(t2, width=50)
entreeText.pack(anchor="w")

# Search button and progress bar.
t3 = Frame(master, bg='#535353')
t3.pack(fill=X)
Rechercher = Button(t3, text ='Rechercher', command=init2)
Rechercher.pack(fill=BOTH, padx=20, pady=10)
pCurrent = ttk.Progressbar(t3, orient='horizontal', mode='determinate', value=5, maximum=nbOfPages)
pCurrent.pack(fill=BOTH, pady=10)

# Scrollable log area.
scrollbar = Scrollbar(master)
scrollbar.pack(side=RIGHT, fill=Y)
texto = Text(master, wrap=WORD, yscrollcommand=scrollbar.set)
texto.pack()
scrollbar.config(command=texto.yview)

master.mainloop()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment