@sz3n
Last active August 4, 2022 16:12
get pastes
# Script for crawling http://psbdmp.com/ and storing password dumps in MongoDB.
import argparse
import re

import requests
from bs4 import BeautifulSoup
from dateutil import parser
from pymongo import MongoClient

url = 'http://psbdmp.com/'

# MongoDB connection handles
client = MongoClient('localhost', 27017)
db = client['psbdmp']
con = db['pastes']
emails_con = db['emails']
ips_con = db['ips']
info_con = db['info']  # defined but not used below

# Regex for email addresses (captures the whole address)
email_regex = re.compile(r'([A-Z0-9._%+-]+@[A-Z0-9.-]+\.[A-Z]{2,4})', re.IGNORECASE)
# Regex for IPv4 addresses
ip_regex = re.compile(r'\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}')
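# Note: both patterns are deliberately loose. ip_regex also matches
# out-of-range strings such as '999.999.999.999', and the {2,4} TLD cap in
# email_regex misses longer TLDs like '.museum'. A looser email variant
# (a suggested alternative, not part of the original gist) would be:
#   email_regex = re.compile(r'([A-Z0-9._%+-]+@[A-Z0-9.-]+\.[A-Z]{2,})', re.IGNORECASE)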
def find_ips_in_html(html):
    """Return the set of IPv4-looking strings found in the given text."""
    if html is None:
        return set()
    return set(ip_regex.findall(html))

def find_emails_in_html(html):
    """Return the set of email addresses found in the given text."""
    if html is None:
        return set()
    return set(email_regex.findall(html))
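# Quick sanity check for the two extractors (illustrative inputs, not psbdmp data):
#   find_emails_in_html('contact admin@example.com') -> {'admin@example.com'}
#   find_ips_in_html('hosts 10.0.0.1 and 192.168.1.1') -> {'10.0.0.1', '192.168.1.1'}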
def get_last_paste_time():
    """Return the timestamp of the newest paste on the first dumps page."""
    res = requests.get(url + 'dumps/1')
    soup = BeautifulSoup(res.content, 'html.parser')
    tr = soup.find('tr', {'class': None})
    time = tr.find_all('td')[2].getText()
    return parser.parse(time)
def update():
    """Stub: fetches the newest paste time; the incremental update itself is not implemented."""
    try:
        last_paste_time = get_last_paste_time()
        print('updating, newest paste at', last_paste_time)
    except Exception:
        print('error in getting time')
def populate():
    """Crawl every dumps page and store pastes, emails, and IPs in MongoDB."""
    res = requests.get(url + 'dumps/')
    soup = BeautifulSoup(res.content, 'html.parser')
    # The pagination widget carries the last page number as a data attribute.
    lis = soup.find_all('li', {'class': 'next page'})
    last_page_num = int(lis[1].find_all('a')[0]['data-ci-pagination-page'])
    # Loop over the paste listing pages (range end is exclusive, so add 1).
    for n in range(1, last_page_num + 1):
        res = requests.get(url + 'dumps/' + str(n))
        soup = BeautifulSoup(res.content, 'html.parser')
        for i in soup.find_all('tr', {'class': None}):
            tds = i.find_all('td')
            link = tds[0].getText().strip()  # paste id
            title = tds[1].getText()
            time = tds[2].getText()
            # Use .text (str, not bytes) so the regexes work under Python 3.
            raw = requests.get(url + 'api/dump/get/' + link)
            post = {
                'link': link,
                'time': time,
                'title': title,
                'raw': raw.text,
                'source': 'pastebin',
            }
            con.insert_one(post)
            # Collect emails
            for email in find_emails_in_html(raw.text):
                email_post = {
                    'link': link,
                    'time': time,
                    'title': title,
                    'source': 'pastebin',
                    'email': email,
                    'domain': email.split('@')[1],
                }
                emails_con.insert_one(email_post)
            # Collect IPs
            for ip in find_ips_in_html(raw.text):
                ip_post = {
                    'link': link,
                    'time': time,
                    'title': title,
                    'source': 'pastebin',
                    'ip': ip,
                }
                ips_con.insert_one(ip_post)
if __name__ == '__main__':
    argparser = argparse.ArgumentParser(
        description='Tool for populating or updating the pastes db')
    argparser.add_argument(
        '--mode', required=True, choices=['u', 'p'],
        help='Choose a mode: u(pdate) or p(opulate). Update mode assumes the '
             'database has already been populated and that info.db exists in '
             'the current directory.')
    args = argparser.parse_args()
    if args.mode == 'u':
        update()
    elif args.mode == 'p':
        populate()
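# Usage sketch (the filename psbdmp_crawler.py is an assumption; the gist does
# not name the file). Requires a local MongoDB listening on the default port:
#   pip install requests beautifulsoup4 pymongo python-dateutil
#   python psbdmp_crawler.py --mode p   # full crawl
#   python psbdmp_crawler.py --mode u   # incremental update (stub)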