Skip to content

Instantly share code, notes, and snippets.

View pudo's full-sized avatar

Friedrich Lindenberg pudo

View GitHub Profile
from typing import (
Generic,
Type,
TypeVar,
)
# Base class with an empty body; it is used as the upper bound of the
# TypeVar ``S`` declared immediately after it.
class Statement(object):
pass
S = TypeVar("S", bound=Statement)
@pudo
pudo / crawl.py
Created May 21, 2022 14:52
OCCRP article extractor
import json
import asyncio
import trafilatura
from datetime import datetime
from typing import Optional
import aiohttp
from lxml import html
from urllib.parse import urlparse, urljoin
from sqlmodel import Field, Session, SQLModel, create_engine, select
from followthemoney import model
from followthemoney.types import registry
from followthemoney.graph import Graph, Node
# Some inspiration:
#
# https://github.com/nchah/freebase-mql#mql-and-graphql
# https://rdflib.readthedocs.io/en/stable/intro_to_graphs.html#basic-triple-matching
# Queries:
database: $DATAVAULT_DATABASE_URI
source:
slug: zz_every_politician
title: "EveryPolitician.org"
url: http://everypolitician.org/
tables:
- zz_every_politician
mappings:
politicians:
schema:
@pudo
pudo / spec.yaml
Created August 9, 2016 06:09
Arrays in JSON Mappings
database: $DATAVAULT_DATABASE_URI
source:
slug: gb_land_registry
title: "UK Overseas Land Owners"
url: https://www.whatdotheyknow.com/request/overseas_company_properties_titl
tables:
- gb_land_registry
mappings:
lands:
schema:
title layout
Tools
default

Influence Mapping Tools

@pudo
pudo / deployscraper.sh
Last active February 4, 2016 14:52
deployscraper
apt-get update -qq
apt-get install -y -q tmux vim git python-virtualenv curl python-pip build-essential python-dev libxml2-dev libxslt1-dev libpq-dev apt-utils ca-certificates unrar-free unzip
pip install lxml dataset metafolder requests csvkit psycopg2 thready unicodecsv
mkdir -p /srv/data/source
chmod 0600 ~/.ssh/id_rsa
echo "ssh-rsa AAAAB3NzaC1yc2EAAAADAQABAAABAQCrzARRFK4JzzvNLzKvWMlnlFBO9D9g3mxvr9Cji4K7psaXHMYN8sv5z+7HaNgee3mpXWvDNnkwZRJj0Cv+H2KxSwfOCRD0ubVmGfLh+3PKlso3bq5aHN0DOcK44Z7j/twE9SMumzpIWGSRrdghGu+XaKu90y/NFxLCwUhZ4EmMf+hFup24c8JLFZ4GFVTZCGveTUri7a9g57RTkJPOuJNDrOCTL3dw1ebxhd8TBm3EfeQ0Fj8o7k0PojJ6OcHpV8Evy4h455ioiJfHrrKif3JUlzC0H2eKWGEQHwEYBjYZVl0dktL7pP7iDG/zO7lzMwgCjXAjYZH+I4zFP8L9le93 occrpscraper" >>~/.ssh/authorized_keys
echo "ssh-rsa AAAAB3NzaC1yc2EAAAADAQABAAABAQCrzARRFK4JzzvNLzKvWMlnlFBO9D9g3mxvr9Cji4K7psaXHMYN8sv5z+7HaNgee3mpXWvDNnkwZRJj0Cv+H2KxSwfOCRD0ubVmGfLh+3PKlso3bq5aHN0DOcK44Z7j/twE9SMumzpIWGSRrdghGu+XaKu90y/NFxLCwUhZ4EmMf+hFup24c8JLFZ4GFVTZCGveTUri7a9g57RTkJPOuJNDrOCTL3dw1ebxhd8
@pudo
pudo / test.py
Created May 18, 2015 17:11
scrape wikipedia category pages
import json
import re
import mwclient
import unicodecsv
site = mwclient.Site('en.wikipedia.org')
disam = re.compile('\(.*\)$')
def get_pages(cat):
@pudo
pudo / upload_s3.py
Created May 18, 2015 08:20
Failing to upload via S3 from morph.
import os
import boto
import boto.s3
from boto.s3.key import Key
def upload():
conn = boto.connect_s3(os.environ.get('MORPH_AWS_ACCESS_KEY_ID'),
os.environ.get('MORPH_AWS_SECRET_ACCESS_KEY'),
validate_certs=False)
@pudo
pudo / schema_proposal.yaml
Created May 5, 2015 18:32
A proposed metadata structure for OpenSpending raw data.
# This is an alternate proposal for a metadata structure for OpenSpending
# data models. The most significant change is that data is modelled in a
# way that highlights logical connections between fields, rather than being based on
# columns. This also means that column naming conventions are not needed.
#
# This proposal uses YAML to represent the model, but implementations
# would probably use JSON instead.
# The proposed format is currently supported by spendb and cubepress.
#
# The following is a data model for a fictitious budget/spending dataset.