Skip to content

Instantly share code, notes, and snippets.

View boogheta's full-sized avatar

Benjamin Ooghe-Tabanou boogheta

View GitHub Profile
@boogheta
boogheta / sandcrawler-test.js
Created December 9, 2014 16:40
Tryouts with sandcrawler on Libération's articles tagged with "sexe"
var sandcrawler = require("sandcrawler"),
//artoo = require("sandcrawler/node_modules/artoo-js"),
artoo = require("artoo-js"),
logger = require("sandcrawler-logger"),
fs = require("fs"),
data = [];
// Declare a plugin
var throttle = function(opts) {
return function(scraper) {
class Bash(object):
    """Holder that captures the right-hand operand of the ``%`` operator.

    ``instance % value`` stores ``value`` on the instance and evaluates to
    the instance itself; the stored value is then readable through ``out``.
    """

    @property
    def out(self):
        """Return the value stored by the most recent ``%`` operation.

        Raises AttributeError if ``%`` has not been applied yet, because
        nothing in this class sets ``_wrapped`` before ``__mod__`` runs.
        """
        return self._wrapped

    def __mod__(self, other):
        """Store ``other`` and return ``self`` so the result can be reused."""
        self._wrapped = other
        return self
@boogheta
boogheta / ratio_similarity.md
Created April 11, 2017 09:44
Calcul de ratio de similarité entre 2 textes avec difflib en python.md
from difflib import SequenceMatcher
# Two sample sentences that share a few words and letters.
text1 = "Mais pourquoi la petite sirène est-elle aussi super, ce n'est pas comme les méchants poissons"
text2 = "Il était une fois une petite sirène super méchante qui mangeait des poissons"

# No junk filter; compare text1 (sequence a) against text2 (sequence b).
matcher = SequenceMatcher(isjunk=None, a=text1, b=text2)
blocks = matcher.get_matching_blocks()

# Each entry is a Match(a, b, size) named tuple: print the length of the
# common run, where it starts in each text, and the shared substring itself.
for match in blocks:
    start = match.a
    print(match.size, start, match.b, text1[start:start + match.size])
>>> 1 1 5 a
>>> 3 2 15 is 
@boogheta
boogheta / assemble.sh
Created March 22, 2018 10:41
Assemble csv lycées
# On lit les headers dans la première ligne des csv
headers=$(cat */*/*.csv | head -1)
# On réécrit les headers avec nos champs en plus dans le fichier final
echo "Lycee,classe,eleve,$headers" > all.csv
# On itère sur l'arborescence des dossiers et on stocke les noms dans des variables
ls | grep Lycée | while read lycee; do
ls "$lycee" | while read classe; do
ls "$lycee/$classe" | grep ".csv$" | sed 's/\.csv$//' | while read eleve; do
@boogheta
boogheta / scrolldown_and_unfold.js
Created August 11, 2014 15:02
Scroll and unfold a webpage within PhantomJS or artoo.js
// Handle script with no argument for DEBUG as an artoo bookmarklet
if (typeof(arguments) == "undefined") {
arguments = [60, 20, 15, function(){console.log("FINISHED!");}];
}
(function(endScript, timeout, idle_timeout, ajax_timeout) {
var timeout = Math.max(30, timeout) * 1000,
idle_timeout = idle_timeout * 1000,
ajax_timeout = ajax_timeout * 1000;
@boogheta
boogheta / parse_factiva_html.py
Created November 12, 2019 16:20
Factiva's html parser
#!/usr/bin/env python
# -*- coding: utf-8 -*-
import os, sys, csv, json
from datetime import datetime
from pyquery import PyQuery as pq
# TODO
# - Remove duplicates
# - Filtering
@boogheta
boogheta / instagram.py
Created January 14, 2020 15:59
Collect users and followers on Instagram with fake InstagramAPI
import sys, json
from InstagramAPI import InstagramAPI
from time import sleep
from pprint import pprint
#import pandas as pd
#import requests, urllib, bs4, ssl, json, sys, re
def connect_api(user, password):
api = InstagramAPI(user, password)
retries = 12
@boogheta
boogheta / scan_ports.sh
Created April 21, 2022 12:12
Scan open ports
# alarm SECONDS COMMAND [ARGS...]
# Run COMMAND under a time limit by delegating to a perl one-liner.
# All arguments are forwarded to perl: the first is shifted off and passed
# to perl's `alarm`, the rest are passed to perl's `system`.
# The ALRM handler dies, which aborts the surrounding `eval`; when the eval
# failed, perl exits with status 1. The return value of `system` is
# discarded, so the command's own exit status is not forwarded.
# NOTE(review): nothing here kills the command when the timer fires, so it
# may keep running after this function returns - confirm whether callers care.
alarm() {
perl -e '
eval {
$SIG{ALRM} = sub { die };
alarm shift;
system(@ARGV);
};
if ($@) { exit 1 }
' "$@";
}
@boogheta
boogheta / test_port.py
Created April 21, 2022 12:15
Test port open
#!/usr/bin/env python
import sys
from socket import socket
ports = range(int(sys.argv[1]), int(sys.argv[2]))
s = socket()
for port in ports:
try:
print "TEST %s..." % port
This file has been truncated, but you can view the full file.
<?xml version='1.0' encoding='UTF-8'?>
<gexf xmlns="http://gexf.net/1.3" version="1.3" xmlns:viz="http://gexf.net/1.3/viz" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:schemaLocation="http://gexf.net/1.3 http://gexf.net/1.3/gexf.xsd">
<meta lastmodifieddate="2022-07-08">
<creator>Gephi 0.9.3</creator>
<description></description>
</meta>
<graph defaultedgetype="undirected" mode="static">
<attributes class="node" mode="static">
<attribute id="attr_type" title="attr_type" type="string"/>
<attribute id="global_occurrences" title="global_occurrences" type="integer"/>