First, download the data files using a BitTorrent client:
aria2c https://academictorrents.com/download/4dcfdf804775f2d92b7a030305fa0350ebef6f3e.torrent
Next, convert the data files to a single newline-delimited JSON file:
deno run -A process.ts
import type { NextRequest } from 'next/server' | |
import { createParser } from 'eventsource-parser' | |
export const config = { | |
runtime: 'edge', | |
} | |
export default async function handler(req: NextRequest) { | |
const encoder = new TextEncoder() | |
const decoder = new TextDecoder() |
# One-time setup: install and configure the AWS CLI.
# brew install awscli
# aws configure

# Upload the source PDF to S3 (Textract reads its input from S3).
aws s3 cp your-file.pdf s3://your-bucket/your-file.pdf

# Install the Textract helper CLI:
# https://pypi.org/project/amazon-textract-helper/
# https://github.com/aws-samples/amazon-textract-textractor/tree/master/helper
# pip install amazon-textract-helper

# Run table extraction on the uploaded document and print the tables as CSV.
amazon-textract --input-document s3://your-bucket/your-file.pdf --features TABLES --pretty-print TABLES --pretty-print-table-format=csv

# Background reading:
# https://aws.amazon.com/blogs/machine-learning/automatically-extract-text-and-structured-data-from-documents-with-amazon-textract/
// NOTE(review): appears to be a DevTools-console snippet that un-clamps
// scrollable containers so the whole page renders at full height
// (e.g. before printing/capturing) — confirm intended use.
[...document.querySelectorAll('div,main,body')].forEach((node) => {
  // Neutralize fixed/sticky positioning and height clamps on every container.
  node.style.position = 'relative'
  node.style.height = 'auto'
  node.style.overflowY = 'visible'
});
// Strip all buttons from the document.
[...document.querySelectorAll('button')].forEach((node) => {
  node.remove()
});
# Download the programme (plus its .srt subtitles) from BBC iPlayer by PID.
get_iplayer --pid m001d2h4 --subtitles --output "m001d2h4"

# Burn the subtitles into a 5-second clip starting at 17:49.
# -copyts preserves the input timestamps so the subtitles filter stays in
# sync with the -ss seek point.
ffmpeg -i m001d2h4/Only_Connect_Series_18_-_07._Scrummagers_v_Crustaceans_m001d2h4_original.mp4 -vf "subtitles=m001d2h4/Only_Connect_Series_18_-_07._Scrummagers_v_Crustaceans_m001d2h4_original.srt" -ss 17:49 -t 5 -copyts output.mov
lineReader = () => { | |
let buffer = ""; | |
return new TransformStream({ | |
transform(chunk, controller) { | |
buffer += chunk; | |
const parts = buffer.split("\n"); | |
parts.slice(0, -1).forEach((part) => controller.enqueue(part)); | |
buffer = parts[parts.length - 1]; | |
}, |
import { parse } from 'https://deno.land/x/xml@2.0.4/mod.ts' | |
import { readableStreamFromIterable } from 'https://deno.land/std@0.96.0/io/streams.ts' | |
import { Database } from 'https://deno.land/x/sqlite3@0.5.2/mod.ts' | |
import ProgressBar from 'https://deno.land/x/progress@v1.2.7/mod.ts' | |
let counter = 0 | |
const progress = new ProgressBar({ | |
title: 'processing:', | |
interval: 100, |
First, download the data files using a BitTorrent client:
aria2c https://academictorrents.com/download/4dcfdf804775f2d92b7a030305fa0350ebef6f3e.torrent
Next, convert the data files to a single newline-delimited JSON file:
deno run -A process.ts
export const cloudStorageJsonLinesWriter = (url: string) => { | |
// gcloud components install alpha | |
const process = Deno.run({ | |
cmd: [ | |
'gcloud', | |
'alpha', | |
'storage', | |
'cp', | |
'-', | |
url, |
import { TextLineStream } from 'https://deno.land/std@0.153.0/streams/mod.ts' | |
// Example usage:
// const input = await jsonLinesReader('input.jsonl.gz')
// const output = await jsonLinesWriter('output.jsonl.gz')
// for await (const item of input) {
//   // ...do something with item...
//   await output.write(item)
// }
const createInputReader = async (path: string) => { | |
const file = await Deno.open(path, { | |
read: true, | |
}) | |
return file.readable | |
.pipeThrough(new DecompressionStream('gzip')) | |
.pipeThrough(new TextDecoderStream()) | |
.pipeThrough(new TextLineStream()) | |
.pipeThrough( |