@sz3n
Last active August 4, 2022 16:12
get pastes
# Script for crawling http://psbdmp.com/ and storing password dumps in MongoDB.
import argparse
import re

import requests
from bs4 import BeautifulSoup
from dateutil import parser
from pymongo import MongoClient

url = 'http://psbdmp.com/'

# MongoDB connection handles
client = MongoClient('localhost', 27017)
db = client['psbdmp']
con = db['pastes']
emails_con = db['emails']
ips_con = db['ips']
info_con = db['info']  # defined but not used below

# Regex for email addresses (captures the whole address)
email_regex = re.compile(r'([A-Z0-9._%+-]+@[A-Z0-9.-]+\.[A-Z]{2,4})', re.IGNORECASE)
# Regex for IPv4 addresses
ip_regex = re.compile(r'\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}')
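# Note: both patterns are deliberately loose. ip_regex also matches
# out-of-range strings such as '999.999.999.999', and the {2,4} TLD cap in
# email_regex misses longer TLDs like '.museum'. A looser email variant
# (a suggested alternative, not part of the original gist) would be:
#   email_regex = re.compile(r'([A-Z0-9._%+-]+@[A-Z0-9.-]+\.[A-Z]{2,})', re.IGNORECASE)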
def find_ips_in_html(html):
    """Return the set of IPv4-looking strings found in the given text."""
    if html is None:
        return set()
    return set(ip_regex.findall(html))

def find_emails_in_html(html):
    """Return the set of email addresses found in the given text."""
    if html is None:
        return set()
    return set(email_regex.findall(html))
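# Quick sanity check for the two extractors (illustrative inputs, not psbdmp data):
#   find_emails_in_html('contact admin@example.com') -> {'admin@example.com'}
#   find_ips_in_html('hosts 10.0.0.1 and 192.168.1.1') -> {'10.0.0.1', '192.168.1.1'}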
def get_last_paste_time():
    """Return the timestamp of the newest paste on the first dumps page."""
    res = requests.get(url + 'dumps/1')
    soup = BeautifulSoup(res.content, 'html.parser')
    tr = soup.find('tr', {'class': None})
    time = tr.find_all('td')[2].getText()
    return parser.parse(time)
def update():
    """Stub: fetches the newest paste time; the incremental update itself is not implemented."""
    try:
        last_paste_time = get_last_paste_time()
        print('updating, newest paste at', last_paste_time)
    except Exception:
        print('error in getting time')
def populate():
    """Crawl every dumps page and store pastes, emails, and IPs in MongoDB."""
    res = requests.get(url + 'dumps/')
    soup = BeautifulSoup(res.content, 'html.parser')
    # The pagination widget carries the last page number as a data attribute.
    lis = soup.find_all('li', {'class': 'next page'})
    last_page_num = int(lis[1].find_all('a')[0]['data-ci-pagination-page'])
    # Loop over the paste listing pages (range end is exclusive, so add 1).
    for n in range(1, last_page_num + 1):
        res = requests.get(url + 'dumps/' + str(n))
        soup = BeautifulSoup(res.content, 'html.parser')
        for i in soup.find_all('tr', {'class': None}):
            tds = i.find_all('td')
            link = tds[0].getText().strip()  # paste id
            title = tds[1].getText()
            time = tds[2].getText()
            # Use .text (str, not bytes) so the regexes work under Python 3.
            raw = requests.get(url + 'api/dump/get/' + link)
            post = {
                'link': link,
                'time': time,
                'title': title,
                'raw': raw.text,
                'source': 'pastebin',
            }
            con.insert_one(post)
            # Collect emails
            for email in find_emails_in_html(raw.text):
                email_post = {
                    'link': link,
                    'time': time,
                    'title': title,
                    'source': 'pastebin',
                    'email': email,
                    'domain': email.split('@')[1],
                }
                emails_con.insert_one(email_post)
            # Collect IPs
            for ip in find_ips_in_html(raw.text):
                ip_post = {
                    'link': link,
                    'time': time,
                    'title': title,
                    'source': 'pastebin',
                    'ip': ip,
                }
                ips_con.insert_one(ip_post)
if __name__ == '__main__':
    argparser = argparse.ArgumentParser(
        description='Tool for populating or updating the pastes db')
    argparser.add_argument(
        '--mode', required=True, choices=['u', 'p'],
        help='Choose a mode: u(pdate) or p(opulate). Update mode assumes the '
             'database has already been populated and that info.db exists in '
             'the current directory.')
    args = argparser.parse_args()
    if args.mode == 'u':
        update()
    elif args.mode == 'p':
        populate()
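# Usage sketch (the filename psbdmp_crawler.py is an assumption; the gist does
# not name the file). Requires a local MongoDB listening on the default port:
#   pip install requests beautifulsoup4 pymongo python-dateutil
#   python psbdmp_crawler.py --mode p   # full crawl
#   python psbdmp_crawler.py --mode u   # incremental update (stub)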