Skip to content

Instantly share code, notes, and snippets.

View Segerberg's full-sized avatar

Andreas Segerberg Segerberg

View GitHub Profile
#!/usr/bin/env python
"""
This utility extracts media URLs from tweet jsonl.gz files and saves them as WARC records.
Warcio (https://github.com/webrecorder/warcio) is a dependency and before you can use it you need to:
% pip install warcio
You run it like this:
% python media2warc.py /mnt/tweets/ferguson/tweets-0001.jsonl.gz /mnt/tweets/ferguson/tweets-0001.warc.gz
import feedparser
import sqlite3
import os
from datetime import datetime
import json
"""
Small script that saves links (with deduplication) from RSS feeds, for archiving with Squidwarc
usage:
<html>
<head>
<link rel="stylesheet" href="https://use.fontawesome.com/releases/v5.5.0/css/all.css" integrity="sha384-B4dIYHKNBt8Bc12p+WXckhzcICo0wtJAoU8YZTY5qE0Id1GSseTk6S+L3BlXeVIU" crossorigin="anonymous">
</head>
<body>
<header>
<script type="text/javascript" src="http://ajax.googleapis.com/ajax/libs/jquery/1.6.2/jquery.min.js"> </script>
<style>
/*
import string
import unicodedata
valid_filename_chars = "-_.() %s%s" % (string.ascii_letters, string.digits)
def clean(filename, whitelist=valid_filename_chars, replace=' '):
for r in replace:
filename = filename.replace(r,'_')
# keep only valid ascii chars
import csv
delimiter = ','
delimiter_number = 3
import logging
errors = 0
logging.basicConfig(filename='file.log', level=logging.INFO, format="%(asctime)s %(levelname)s %(message)s")
with open('deniro.csv', 'r') as csvfile:
@Segerberg
Segerberg / pagination_example.sql
Created August 24, 2019 10:01 — forked from ssokolow/pagination_example.sql
Reasonably efficient pagination without OFFSET (SQLite version)
-- Reasonably efficient pagination without OFFSET
-- SQLite version (Adapted from MS SQL syntax)
-- Source: http://www.phpbuilder.com/board/showpost.php?p=10376515&postcount=6
SELECT foo, bar, baz, quux FROM table
WHERE oid NOT IN ( SELECT oid FROM table
ORDER BY title ASC LIMIT 50 )
ORDER BY title ASC LIMIT 10
# Re-encode input.mp4 into output.mp4 using H.264 video (-vcodec h264)
# and MP2 audio (-acodec mp2).
ffmpeg -i input.mp4 -vcodec h264 -acodec mp2 output.mp4
import hashlib
import optparse
from os import walk
from os.path import dirname, isfile, join, relpath,basename
HASH_BLOCK_SIZE = 65536
def hash_file(fname, dig):
if dig == "sha256":
We can make this file beautiful and searchable if this error is corrected: No commas found in this CSV file in line 0.
Datum;Förband;Förband klartext;
1926-07-07;653;Östgöta flygflottilj (F 3);
1926-07-08;653;Östgöta flygflottilj (F 3);
1926-07-28;652;Roslagens flygkår (F 2);
1926-08-09;652;Roslagens flygkår (F 2);
1926-08-15;653;Östgöta flygflottilj (F 3);
1926-08-20;652;Roslagens flygkår (F 2);
1926-08-20;652;Roslagens flygkår (F 2);
1926-08-20;653;Östgöta flygflottilj (F 3);
1926-09-24;653;Östgöta flygflottilj (F 3);
from datasette.database import Database
from datasette import hookimpl
from datasette.utils.asgi import Response
import os
`
async def reload_db(datasette):
spec = datasette.plugin_config('datasette-reload-db')
db_dir = os.listdir(spec['dir'])
databases = datasette.databases