
Fabien Antoine (rhanka)

  • French Administration
  • Paris
@rhanka
rhanka / pgdump_2_csv.sh
Last active January 25, 2018 12:33
Dirty pg_restore + perl conversion
#!/bin/sh
#get all schemas of the dump
schemas=`cat $1 | docker exec -i postgres pg_restore -l | grep SCHEMA | sed 's/^.*\s//' | sort | uniq`
for schema in $schemas
do echo $schema
for table in `cat $1 | docker exec -i postgres pg_restore -l | egrep "TABLE $schema" | sed "s/.*TABLE $schema //;s/ $schema//" | sort | uniq`
do echo $table
# extract each table's COPY block and emit it as a tab-separated file (header from the column list, \N nulls blanked)
cat $1 | docker exec -i postgres pg_restore -t $table | perl -e 'while(<>){if (/^\\.\s*$/) {$dump=0} if ($dump==1) {s/\\N//g;print} if (/COPY/){$dump=1;s/.*\((.*)\).*/$1/;s/,\s*/\t/g;print}}' > $table.csv
done
done
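A hypothetical invocation, assuming a custom-format dump named mydb.dump and a running container named postgres:

sh pgdump_2_csv.sh mydb.dump
# writes one tab-separated <table>.csv per table found in the dump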
@rhanka
rhanka / geocode_addok.py
Last active February 21, 2019 23:12
Geocode with addok/BAN (adresse.data.gouv.fr) in DSS & python 2.7
# -------------------------------------------------------------------------------- NOTEBOOK-CELL: CODE
# -*- coding: utf-8 -*-
from multiprocessing import Process, Queue
import dataiku
from dataiku.customrecipe import get_input_names_for_role
from dataiku.customrecipe import get_output_names_for_role
from dataiku.customrecipe import get_recipe_config
import itertools
import logging
import pandas as pd
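For reference, the addok/BAN geocoder this recipe wraps can also be queried directly from the shell; a minimal sketch against the public adresse.data.gouv.fr endpoint (query values are illustrative):

curl -s 'https://api-adresse.data.gouv.fr/search/?q=8+boulevard+du+port+amiens&limit=1'
# returns a GeoJSON FeatureCollection; each feature carries coordinates and a match score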
@rhanka
rhanka / 2018-04-10-fix-file-resource-domain.js
Last active April 10, 2018 18:13
Udata js domain migration for file resources
/*
* resource.filetype: migrate OLD_DOMAINS to NEW_DOMAIN
* WARNING : it uses a basic 'replace' without a strict regex, be careful about the OLD_DOMAINS
*/
var count = 0;
var urlRegex = /^https?\:\/\/alpha.datalab.mi.*$/i;
db.dataset.find({'resources.filetype': 'file'}).forEach(function(dataset) {
    if (dataset.resources) {
        // NOTE: a plausible sketch of the loop body, with OLD_DOMAINS/NEW_DOMAIN
        // assumed defined as described in the header comment
        dataset.resources.forEach(function(resource) {
            if (urlRegex.test(resource.url)) { resource.url = resource.url.replace(OLD_DOMAINS, NEW_DOMAIN); count++; }
        });
        db.dataset.save(dataset);
    }
});
# -------------------------------------------------------------------------------- NOTEBOOK-CELL: CODE
# -*- coding: utf-8 -*-
import dataiku
import os
import json
import pandas as pd, numpy as np
from dataiku import pandasutils as pdu
os.environ['http_proxy'] = '' # clear any inherited HTTP proxy before using dataikuapi
import dataikuapi
# -------------------------------------------------------------------------------- NOTEBOOK-CELL: CODE
# -*- coding: utf-8 -*-
import dataiku
import pandas as pd, numpy as np
from dataiku import pandasutils as pdu
# from birdy.twitter import UserClient, StreamClient
import json
# -------------------------------------------------------------------------------- NOTEBOOK-CELL: CODE
TWITTER_CONSUMER_KEY = "xxx"
# -------------------------------------------------------------------------------- NOTEBOOK-CELL: CODE
# -*- coding: utf-8 -*-
import dataiku
import pandas as pd, numpy as np
from dataiku import pandasutils as pdu
import hashlib
import re
import json
import requests
@rhanka
rhanka / download_brpp.sh
Last active June 13, 2019 21:52
download BRPP INSEE data
#!/bin/bash
# login and password must be URL-encoded (encodeURIComponent, not just encodeURI)
set -e
set +x
first_year=1970
login=MYBRPPLOGIN
password=MYURLENCODEDPASSWORD
# step 1: create a session token (stored in cookie1)
curl -s -c cookie1 https://echanges.insee.fr/ihm/download/brpp-deces -o session.html && echo session initialized || echo session initialization failed
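Since the credentials must be URL-encoded, one hypothetical way to produce MYURLENCODEDPASSWORD is jq's @uri filter (jq is already a dependency of the scripts below; the password value is illustrative):

jq -rn --arg p 'my raw:password' '$p|@uri'
# prints my%20raw%3Apassword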
#!/bin/bash
# this script downloads all the INSEE deaths files and compresses them
# it requires installing jq to parse the JSON returned by the data.gouv.fr API
curl -s https://www.data.gouv.fr/api/1/datasets/fichier-des-personnes-decedees/ | \
jq '.resources[].url' | sed 's/^/curl -s /;s:/\(deces-.*\)":/\1" | gzip > \1.gz:' | sh
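Each line emitted by the sed rewrite is itself a shell command; a hypothetical example of one generated line (resource URL shortened):

curl -s "https://static.data.gouv.fr/.../deces-2018.txt" | gzip > deces-2018.txt.gz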
#!/bin/bash
for file in `curl -s https://fichier-des-personnes-decedees.s3.fr-par.scw.cloud/ | \
sed 's/<Key>/\n/g' | sed 's/<.*//' | egrep -i 'deces-.*.txt.gz'`;\
do \
(curl -s https://fichier-des-personnes-decedees.s3.fr-par.scw.cloud/$file > $file) && \
echo $file downloaded;\
done
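A quick hypothetical sanity check on one downloaded archive (file name illustrative):

zcat deces-2018.txt.gz | head -3
# prints the first few records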
#!/bin/bash
sudo apt-get install python python-pip wget
pip install yq # provides the xq command-line XML-to-JSON converter used below
wget -rl1 https://echanges.dila.gouv.fr/OPENDATA/CASS/
find -iname '*.tar.gz' | xargs -n 1 tar xvzf
# convert every XML file to JSON (6 in parallel) and display a live progress counter
nxml=`find -iname '*.xml' | wc -l`
(find -iname '*.xml' | xargs -n 1 -P 6 -I {} bash -c 'file={}; cat $file | xq "." > ${file/xml/json}' &)
while (true); do
  find -iname '*.json' | wc -l | awk -v nxml=$nxml '{printf "\rxml>json conversion in progress: " $1 "/" nxml}'
  sleep 1
done