Ed Summers edsu

## extract.py
#!/usr/bin/env python3

import csv
import json

data = json.load(open("outbox.json"))

out = csv.DictWriter(open("outbox.csv", "w"), ["published", "to", "content"])
out.writeheader()

## hostnames.py
from bs4 import BeautifulSoup
from urllib.parse import urlparse
from warcio.archiveiterator import ArchiveIterator

counts = {}

def count_links(record):
    """Parse an HTML response record so its links can be counted.

    Records that carry no HTTP headers, lack a Content-Type header, or
    whose Content-Type does not mention "html" are skipped (returns None)
    instead of raising.

    Args:
        record: an archive record exposing ``http_headers`` (with a
            dict-style ``get``) and ``raw_stream``.
    """
    headers = record.http_headers
    # Some records have no HTTP headers at all; nothing to parse there.
    if headers is None:
        return
    # A missing Content-Type comes back as None; normalise it to "" so the
    # substring test below cannot raise TypeError.
    content_type = headers.get("content-type") or ""
    if "html" not in content_type:
        return
    doc = BeautifulSoup(record.raw_stream, "lxml")

## nytimes-gptbot.sh
#!/bin/bash

#
# Use the Internet Archive Wayback Machine to demonstrate roughly when the
# NYTimes started blocking GPTBot.
#
# See: https://www.theverge.com/2023/8/21/23840705/new-york-times-openai-web-crawler-ai-gpt
#

wget -q -O robots-20230817.txt https://web.archive.org/web/20230817012138id_/https://www.nytimes.com/robots.txt

## extract_images.py
#!/usr/bin/env python3

import sys
import pathlib

from urllib.parse import urlparse
from warcio.archiveiterator import ArchiveIterator

def save(url, stream):
    uri = urlparse(url)

## example.md

      
              1 file
            
          
              0 forks
            
          
              0 comments
            
          
              0 stars
            
          
                edsu
                / example.md
            
            
              Last active
              August 24, 2023 14:31
            
          
    Org A split off of Org B, Org B split into Org C & Org D, Org A and Org D merged into Org E?
can be turned into Mermaid notation
graph TD;
  B --> A;
  B --> C;
  B --> D;
  A --> E;


## crawl.yaml
collection: fatal-encounters
generateWACZ: true
workers: 4
screencastPort: 9037
seeds:
  - url: https://fatalencounters.org/
    scopeType: prefix
  - url: https://www.wsoctv.com/news/1-person-dead-after-attempting-escape-police-troopers-say/QXA244QPUZGJ5GAGRADGDWBAEU/
    scopeType: page
  - url: https://www.wtok.com/2022/01/01/officer-involved-shooting/

## check.py
#!/usr/bin/env python

from warcio.archiveiterator import ArchiveIterator

with open('archive/rec-20230722210008512613-81a34b41ee13.warc.gz', 'rb') as stream:
    for i, record in enumerate(ArchiveIterator(stream)):
        print(i, record.rec_headers.get_header('WARC-Target-URI'))
        if record.rec_type == 'response':
            content = record.content_stream().read()

## writer.py
from warcio.warcwriter import WARCWriter

with open('test.warc.gz', 'wb') as output:
    writer = WARCWriter(output, gzip=True)

    # write some metadata for the WARC as an info record
    rec = writer.create_warcinfo_record('test.warc.gz', {
        'software': 'warcio',
        'description': 'An example of packaging up two images in a WARC'
    })

## warc2mbox.py
#!/usr/bin/env python3

# run like this:
#
# $ python3 warc2mbox.py yahoo-groups-2016-03-20T12:45:19Z-nyzp9w.warc.gz
#
# and it will generate an mbox file for each Yahoo Group:
#
# $ ls -l mboxes
# -rw-r--r--  1 edsummers  staff    12522488 Jul 15 14:14 amicigranata.mbox

## check-swap.py
#!/usr/bin/env python3

import csv
import sys
import json
import time
import requests

def get_snapshots(url):
    url = f"https://swap.stanford.edu/was/cdx?url={url}&output=json"
	#!/usr/bin/env python3

	import csv
	import json

	data = json.load(open("outbox.json"))

	out = csv.DictWriter(open("outbox.csv", "w"), ["published", "to", "content"])
	out.writeheader()
	from bs4 import BeautifulSoup
	from urllib.parse import urlparse
	from warcio.archiveiterator import ArchiveIterator

	counts = {}

	def count_links(record):
	if "html" not in record.http_headers.get("content-type"):
	return
	doc = BeautifulSoup(record.raw_stream, "lxml")
	#!/bin/bash

	#
	# Use the Internet Archive Wayback Machine to demonstrate roughly when the
	# NYTimes started blocking GPTBot.
	#
	# See: https://www.theverge.com/2023/8/21/23840705/new-york-times-openai-web-crawler-ai-gpt
	#

	wget -q -O robots-20230817.txt https://web.archive.org/web/20230817012138id_/https://www.nytimes.com/robots.txt
	#!/usr/bin/env python3

	import sys
	import pathlib

	from urllib.parse import urlparse
	from warcio.archiveiterator import ArchiveIterator

	def save(url, stream):
	uri = urlparse(url)
	collection: fatal-encounters
	generateWACZ: true
	workers: 4
	screencastPort: 9037
	seeds:
	- url: https://fatalencounters.org/
	scopeType: prefix
	- url: https://www.wsoctv.com/news/1-person-dead-after-attempting-escape-police-troopers-say/QXA244QPUZGJ5GAGRADGDWBAEU/
	scopeType: page
	- url: https://www.wtok.com/2022/01/01/officer-involved-shooting/
	#!/usr/bin/env python

	from warcio.archiveiterator import ArchiveIterator

	with open('archive/rec-20230722210008512613-81a34b41ee13.warc.gz', 'rb') as stream:
	for i, record in enumerate(ArchiveIterator(stream)):
	print(i, record.rec_headers.get_header('WARC-Target-URI'))
	if record.rec_type == 'response':
	content = record.content_stream().read()
	from warcio.warcwriter import WARCWriter

	with open('test.warc.gz', 'wb') as output:
	writer = WARCWriter(output, gzip=True)

	# write some metadata for the warc as a info record
	rec = writer.create_warcinfo_record('test.warc.gz', {
	'software': 'warcio',
	'description': 'An example of packaging up two images in a WARC'
	})
	#!/usr/bin/env python3

	# run like this:
	#
	# $ python3 warc2mbox.py yahoo-groups-2016-03-20T12:45:19Z-nyzp9w.warc.gz
	#
	# and it will generate an mbox file for each Yahoo Group:
	#
	# $ ls -l mboxes
	# -rw-r--r-- 1 edsummers staff 12522488 Jul 15 14:14 amicigranata.mbox