A simple scraper for sicris.si
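The script fetches the list of researcher IDs from the SICRIS REST endpoint, then downloads each researcher's full record and stores it in a local MongoDB database.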
#!/usr/bin/env python3
# -*- coding: UTF-8 -*-
__author__ = "Jani Šumak <jani.sumak@gmail.com>"
__version__ = "1.0"
import datetime
import json
import logging
import time

import requests
from pymongo import MongoClient
# Start the timer
start = time.perf_counter()  # time.clock() was removed in Python 3.8
# Set up logging to the console and to a file
logger = logging.getLogger()
logger.setLevel(logging.INFO)  # the default WARNING level would hide logger.info() calls
formatter = logging.Formatter("%(asctime)s - %(levelname)s - %(message)s")
out = logging.StreamHandler()
out.setFormatter(formatter)
f = logging.FileHandler("ResearcherCrawler.log")
f.setFormatter(formatter)
logger.addHandler(out)
logger.addHandler(f)
# Prepare the crawler
start_url = "http://www.sicris.si"
rest_url = "http://www.sicris.si/Common/rest.aspx?"
# The params are special because the service is not an API by
# modern standards, so we format the query string by hand
fields = "fields="
session_id = "sessionID=1234CRIS12002B01B01A03IZUMBFICDOSKJHS588Nn44131"
entity = "entity="
method_call = "methodCall="
country = "country=SI_JSON"
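# For illustration, a fully assembled request URL (derived from the pieces
# above; the sessionID is abbreviated here) looks like:
#   http://www.sicris.si/Common/rest.aspx?fields=rsrid&sessionID=...&entity=RSR&methodCall=...&country=SI_JSON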
# Get session ID
s = requests.Session()
s.headers.update({"User-Agent": "ResearcherCrawler/1.0 (jani.sumak@gmail.com)"})
r = s.get(start_url)
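# NOTE (assumption): the initial GET to the landing page presumably lets the
# session pick up any cookies the site sets; the hard-coded sessionID above
# is sent as a plain query parameter on every REST call.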
# Get the list of researchers
r_fields = fields + "rsrid"
r_method_call = method_call + "auto=%20and%20lang=slv"
r_entity = entity + "RSR"
params = "&".join([r_fields, session_id, r_entity, r_method_call, country])
r = s.get(rest_url + params)
# The JSON payload sits on the last line of the response body
researchers = r.text.splitlines()[-1]
researchers_list = json.loads(researchers)
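# Assumed payload shape (inferred from the loop below, not verified):
# a list of objects like [{"RSRID": "12345"}, ...], keyed by "RSRID".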
# Set up the database connection (local MongoDB, database named "db")
client = MongoClient()
db = client.db
# Fetch each researcher's full record
res_entity = entity + "rsr"
for res in researchers_list:
    rsr_id = res["RSRID"]
    res_method_call = "methodCall=id={} and lang=slv".format(rsr_id)
    params = "&".join([fields, session_id, res_entity, res_method_call, country])
    # Be polite to the server: wait a bit between requests
    time.sleep(5)
    r = s.get(rest_url + params)
    person = r.text.splitlines()[-1]
    # Strip the surrounding list brackets so we parse a single JSON object
    person_json = json.loads(person[1:-1])
    try:
        db.researchers.insert_one(person_json)
        logger.info("Inserted %s into database", rsr_id)
    except Exception:
        logger.warning("Failed to insert %s", rsr_id)
end = time.perf_counter()
elapsed = end - start
print("The program has ended")
print("Finished: {}".format(datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S")))
print("Elapsed: {:.1f} seconds".format(elapsed))
print("Written {} JSON objects".format(len(researchers_list)))