Skip to content

Instantly share code, notes, and snippets.

@initbrain
Last active December 14, 2018 09:59
Show Gist options
  • Save initbrain/6864902 to your computer and use it in GitHub Desktop.
Save initbrain/6864902 to your computer and use it in GitHub Desktop.
RIPE Database - Full Text Search Scraper (using QtWebKit.QWebView from PySide)
#!/usr/bin/python
# -*- coding: utf-8 -*-
import json
import re
import sys

from PySide import QtCore, QtGui, QtWebKit
class Browser(QtGui.QMainWindow):
    """Main window that scrapes the RIPE Database full-text search.

    The window embeds a ``QWebView``.  A search loads ``self.ripe_url``,
    fills in and submits the search form through JavaScript, then parses
    each result page with a regular expression and clicks the "next page"
    element for as long as its id is present in the page source.

    ``self.currentPage`` doubles as the state flag: ``0`` means that no
    search is running, any other value is the number of the result page
    that is currently being loaded or parsed.
    """

    # One search hit in the result page; groups are (link, res, found).
    RESULT_REGEX = re.compile(
        r'<a href="([\S ]+?)" target="_blank" style="color:blue">'
        r'(.+?)\s+?<br>\s+?</a>\s+?'
        r'<small><small>([\S\s]+?)</small></small>',
        re.MULTILINE)

    # DOM ids of the elements driven through JavaScript.
    SEARCH_FIELD_ID = 'home_search:searchform_q'
    SEARCH_BUTTON_ID = 'home_search:doSearch'
    NEXT_PAGE_ID = ('resultsView:paginationViewTop:paginationForm:'
                    'main:after:repeat:0:byIndex')

    def __init__(self):
        """
        Initialize the browser GUI and connect the events
        """
        super(Browser, self).__init__()
        self.initUI()

    def initUI(self):
        """Build the widgets, wire the signals and reset the search state."""
        self.resize(800, 600)
        self.center()
        self.setWindowTitle('RIPE Database - Full Text Search Scraper')
        # self.setWindowIcon(QtGui.QIcon('icone.png'))  # TODO

        # NOTE: this attribute shadows the inherited
        # QMainWindow.centralWidget() accessor; the name is kept so that
        # existing users of the attribute keep working.
        self.centralWidget = QtGui.QWidget(self)
        self.mainLayout = QtGui.QHBoxLayout(self.centralWidget)
        self.mainLayout.setSpacing(0)
        self.mainLayout.setContentsMargins(10, 10, 10, 10)  # setMargin(10) for PyQT4

        self.frame = QtGui.QFrame(self.centralWidget)
        self.gridLayout = QtGui.QVBoxLayout(self.frame)
        self.gridLayout.setContentsMargins(0, 0, 0, 0)
        self.gridLayout.setSpacing(10)

        # Search bar: query field + "Search" button.
        self.horizontalLayout = QtGui.QHBoxLayout()
        self.tb_url = QtGui.QLineEdit(self.frame)
        self.tb_go = QtGui.QPushButton(self.frame)
        self.tb_go.setText("Search")
        self.tb_url.setText("Type your search here...")
        self.tb_url.setToolTip('RIPE Database text search')
        self.horizontalLayout.addWidget(self.tb_url)
        self.horizontalLayout.addWidget(self.tb_go)
        self.gridLayout.addLayout(self.horizontalLayout)

        # Web view that renders (and lets us script) the RIPE pages.
        self.html = QtWebKit.QWebView()
        self.gridLayout.addWidget(self.html)
        self.mainLayout.addWidget(self.frame)
        self.setCentralWidget(self.centralWidget)

        self.connect(self.tb_url, QtCore.SIGNAL("returnPressed()"), self.browse)
        self.connect(self.tb_go, QtCore.SIGNAL("clicked()"), self.browse)
        self.connect(self.html, QtCore.SIGNAL('loadFinished(bool)'), self.loadFinished)
        self.connect(self.html, QtCore.SIGNAL('loadStarted()'), self.loadStarted)

        # Search state.
        self.ripe_url = "https://apps.db.ripe.net/search/full-text.html"
        self.currentPage = 0
        self.result = []

        # Literal substrings stripped from every captured field.
        self.eraseList = ['(\\/b)',
                          '<b>',
                          '</b>',
                          '\n']
        # Literal substitutions applied to the captured link only.
        self.replaceDict = {'&amp;': '&',
                            ' ': '%20'}

        # Status bar
        self.statusBar().showMessage('Ready')
        self.statusProgressBar = QtGui.QProgressBar()
        self.statusBar().addPermanentWidget(self.statusProgressBar)
        self.statusProgressBar.setGeometry(30, 40, 200, 25)
        # min == max == 0 makes Qt draw a busy indicator instead of a percentage.
        self.statusProgressBar.setRange(0, 0)
        self.statusProgressBar.setVisible(False)

    def center(self):
        """Move the window to the centre of the available desktop area."""
        qr = self.frameGeometry()
        cp = QtGui.QDesktopWidget().availableGeometry().center()
        qr.moveCenter(cp)
        self.move(qr.topLeft())

    def browse(self):
        """
        Load the RIPE search page and show it in the web view widget.

        The query itself is submitted from ``loadFinished`` once the page
        has been loaded.
        """
        self.html.load(QtCore.QUrl(self.ripe_url))
        self.html.show()

    def loadStarted(self):
        """Show the busy indicator and report progress in the status bar."""
        self.statusProgressBar.setVisible(True)
        if not self.currentPage:
            self.statusBar().showMessage('Search started : %s' % self.tb_url.text())
            # Lock the inputs for the whole duration of the search.
            self._setInputsEnabled(False)
        else:
            self.statusBar().showMessage('Search : %s (parsing page %d)'
                                         % (self.tb_url.text(), self.currentPage))

    def loadFinished(self, ok):
        """Advance the search once a page has finished loading.

        ok -- value of the ``loadFinished(bool)`` signal; when false the
              search is aborted and the inputs are unlocked again instead
              of scripting or parsing a page that did not load.
        """
        self.statusProgressBar.setVisible(False)
        if not ok:
            self._abortSearch()
        elif not self.currentPage:
            # Page pointer not set yet: the search form has just been loaded.
            self._submitQuery()
        else:
            # Search in progress: a result page has just been loaded.
            self._parseResultPage()

    def showResult(self):
        """Open a non-modal dialog listing the collected results, one per line."""
        count = len(self.result)
        resultDialog = QtGui.QDialog(self)
        resultDialog.resize(600, 400)
        resultDialog.setWindowTitle("%s result%s : %s" % (count,
                                                         self._plural(count),
                                                         self.tb_url.text()))
        text_editor = QtGui.QTextEdit(resultDialog)
        text_editor.setLineWrapMode(QtGui.QTextEdit.NoWrap)
        # setPlainText: results are scraped text and must never be
        # interpreted as rich text, which setText() may do automatically.
        text_editor.setPlainText('\n'.join(self.result))
        resultLayout = QtGui.QVBoxLayout()
        resultLayout.addWidget(text_editor)
        resultDialog.setLayout(resultLayout)
        resultDialog.show()

    @staticmethod
    def _plural(count):
        """Return 's' when count is greater than one, '' otherwise."""
        return 's' if count > 1 else ''

    def _setInputsEnabled(self, enabled):
        """Enable or disable the query field and the search button together."""
        self.tb_go.setEnabled(enabled)
        self.tb_url.setEnabled(enabled)

    def _abortSearch(self):
        """Report a failed page load and return to the idle state."""
        print("[-] Page load failed")
        self.statusBar().showMessage('Page load failed : %s' % self.tb_url.text())
        self.currentPage = 0
        self._setInputsEnabled(True)

    def _submitQuery(self):
        """Fill in the search form through JavaScript and submit it."""
        frame = self.html.page().currentFrame()
        # json.dumps yields a quoted, fully escaped string literal, so quotes
        # or backslashes typed by the user cannot break out of the script.
        frame.evaluateJavaScript('document.getElementById("%s").value = %s;'
                                 % (self.SEARCH_FIELD_ID,
                                    json.dumps(self.tb_url.text())))
        frame.evaluateJavaScript('document.getElementById("%s").click();'
                                 % self.SEARCH_BUTTON_ID)
        print("[+] Query initiated")
        self.result = []
        self.currentPage += 1

    def _cleanResult(self, link, res, found):
        """Strip markup noise from one hit and format it as 'found, res, link'."""
        for pattern in self.eraseList:
            link = link.replace(pattern, '')
            res = res.replace(pattern, '')
            found = found.replace(pattern, '')
        for pattern, replacement in self.replaceDict.items():
            link = link.replace(pattern, replacement)
        # Collapse the remaining runs of whitespace into single spaces.
        res = re.sub(r'\s+', ' ', res)
        found = re.sub(r'\s+', ' ', found)
        return "%s, %s, %s" % (found, res, link)

    def _parseResultPage(self):
        """Collect the hits of the loaded page, then go on or finish."""
        frame = self.html.page().currentFrame()
        sourceCode = "%s" % frame.toHtml()
        matches = self.RESULT_REGEX.findall(sourceCode)
        total = len(self.result) + len(matches)
        print("[+] Parsing page %d (%d bytes), %d result%s"
              % (self.currentPage,
                 self.html.page().bytesReceived(),
                 total,
                 self._plural(total)))
        # Noise removal
        for link, res, found in matches:
            self.result.append(self._cleanResult(link, res, found))
        # Use JavaScript to go to the next page if the link is present
        if self.NEXT_PAGE_ID in sourceCode:
            frame.evaluateJavaScript('document.getElementById("%s").click();'
                                     % self.NEXT_PAGE_ID)
            self.currentPage += 1
        else:
            self._finishSearch()

    def _finishSearch(self):
        """Print and display the results, then return to the idle state."""
        count = len(self.result)
        suffix = self._plural(count)
        print("[+] Research completed (%d result%s)" % (count, suffix))
        print('\n'.join(self.result))
        self.showResult()
        self.statusBar().showMessage('Research completed : %s (%s result%s)'
                                     % (self.tb_url.text(), count, suffix))
        self.currentPage = 0
        self._setInputsEnabled(True)
if __name__ == "__main__":
    # Script entry point: the QApplication has to exist before any widget.
    application = QtGui.QApplication(sys.argv)
    window = Browser()
    window.show()
    # Run the Qt event loop and hand its return code back to the shell.
    exit_code = application.exec_()
    sys.exit(exit_code)
@AtonLerin
Copy link

I have extracted your search bar from this code for a personal project (browsing 3D element names in a JSON file with this kind of QLineEdit search bar). Thanks!

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment