Skip to content

Instantly share code, notes, and snippets.

View Segerberg's full-sized avatar

Andreas Segerberg Segerberg

View GitHub Profile
#!/usr/bin/env python
"""
This utility extracts media URLs from tweet jsonl.gz files and saves them as WARC records.
Warcio (https://github.com/webrecorder/warcio) is a dependency and before you can use it you need to:
% pip install warcio
You run it like this:
% python media2warc.py /mnt/tweets/ferguson/tweets-0001.jsonl.gz /mnt/tweets/ferguson/tweets-0001.warc.gz
import feedparser
import sqlite3
import os
from datetime import datetime
import json
"""
Small script that saves links (with deduplication) from RSS feeds, for archiving with Squidwarc
usage:
<html>
<head>
<link rel="stylesheet" href="https://use.fontawesome.com/releases/v5.5.0/css/all.css" integrity="sha384-B4dIYHKNBt8Bc12p+WXckhzcICo0wtJAoU8YZTY5qE0Id1GSseTk6S+L3BlXeVIU" crossorigin="anonymous">
</head>
<body>
<header>
<script type="text/javascript" src="http://ajax.googleapis.com/ajax/libs/jquery/1.6.2/jquery.min.js"> </script>
<style>
/*
import string
import unicodedata
valid_filename_chars = "-_.() %s%s" % (string.ascii_letters, string.digits)
def clean(filename, whitelist=valid_filename_chars, replace=' '):
for r in replace:
filename = filename.replace(r,'_')
# keep only valid ascii chars
import csv
delimiter = ','
delimiter_number = 3
import logging
errors = 0
logging.basicConfig(filename='file.log', level=logging.INFO, format="%(asctime)s %(levelname)s %(message)s")
with open('deniro.csv', 'r') as csvfile:
@Segerberg
Segerberg / pagination_example.sql
Created August 24, 2019 10:01 — forked from ssokolow/pagination_example.sql
Reasonably efficient pagination without OFFSET (SQLite version)
-- Reasonably efficient pagination without OFFSET
-- SQLite version (Adapted from MS SQL syntax)
-- Source: http://www.phpbuilder.com/board/showpost.php?p=10376515&postcount=6
SELECT foo, bar, baz, quux FROM table
WHERE oid NOT IN ( SELECT oid FROM table
ORDER BY title ASC LIMIT 50 )
ORDER BY title ASC LIMIT 10
# Re-encode input.mp4 into output.mp4 using H.264 video (-vcodec h264)
# and MP2 audio (-acodec mp2).
ffmpeg -i input.mp4 -vcodec h264 -acodec mp2 output.mp4
import hashlib
import optparse
from os import walk
from os.path import dirname, isfile, join, relpath,basename
HASH_BLOCK_SIZE = 65536
def hash_file(fname, dig):
if dig == "sha256":
We can make this file beautiful and searchable if this error is corrected: No commas found in this CSV file in line 0.
Datum;Förband;Förband klartext;
1926-07-07;653;Östgöta flygflottilj (F 3);
1926-07-08;653;Östgöta flygflottilj (F 3);
1926-07-28;652;Roslagens flygkår (F 2);
1926-08-09;652;Roslagens flygkår (F 2);
1926-08-15;653;Östgöta flygflottilj (F 3);
1926-08-20;652;Roslagens flygkår (F 2);
1926-08-20;652;Roslagens flygkår (F 2);
1926-08-20;653;Östgöta flygflottilj (F 3);
1926-09-24;653;Östgöta flygflottilj (F 3);
from datasette.database import Database
from datasette import hookimpl
from datasette.utils.asgi import Response
import os
`
async def reload_db(datasette):
spec = datasette.plugin_config('datasette-reload-db')
db_dir = os.listdir(spec['dir'])
databases = datasette.databases