Skip to content

Instantly share code, notes, and snippets.

@neongreen
Created March 20, 2023 11:33
Show Gist options
  • Star 1 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save neongreen/db44619af642e72980f3d88e06f57595 to your computer and use it in GitHub Desktop.
Save neongreen/db44619af642e72980f3d88e06f57595 to your computer and use it in GitHub Desktop.
voice recorder yeah
import * as openai from "npm:openai"
import outdent from "https://deno.land/x/outdent@v0.8.0/mod.ts"
import ora from "npm:ora"
import fg from "npm:fast-glob"
import { DB } from "https://deno.land/x/sqlite@v3.7.0/mod.ts"
import * as path from "https://deno.land/std@0.180.0/path/mod.ts"
import * as fs from "node:fs"
import PQueue from "npm:p-queue"
import * as MusicMetadata from "npm:music-metadata"
import "https://deno.land/std@0.180.0/dotenv/load.ts"
/** Concurrency limit handed to the queue that runs transcription requests. */
const TRANSCRIPTION_CONCURRENCY = 10
/** Concurrency limit handed to the queue that runs labelling requests. */
const LABELLING_CONCURRENCY = 10

// OpenAI client. The API key is read from the OPENAI_API_KEY environment
// variable; the dotenv loader imported above runs before this point.
const openaiConfiguration = new openai.Configuration({
  apiKey: Deno.env.get("OPENAI_API_KEY")!,
})
const openaiApi = new openai.OpenAIApi(openaiConfiguration)

// SQLite store of memos: one row per recording, holding the file name, the
// transcript text and the category label (NULL until the memo is labelled).
const MEMOS_DB_FILE = "memos.sqlite"
const memosDb = new DB(MEMOS_DB_FILE)
memosDb.execute("CREATE TABLE IF NOT EXISTS memos (name, text, label)")
// Step 1: transcribe every new recording found on the recorder volume and
// store the transcript in the `memos` table.
{
  const found = await fg("/Volumes/IC RECORDER/REC_FILE/**/*.mp3")
  console.log(`Found ${found.length} memos`)

  // Keep only recordings that are not in the database yet and whose size does
  // not exceed 3,000,000 bytes.
  const files: string[] = []
  for (const file of found) {
    const name = path.basename(file)
    const exists =
      memosDb.query("SELECT * FROM memos WHERE name = ?", [name]).length > 0
    const tooLarge = (await Deno.stat(file)).size > 3_000_000
    if (tooLarge) console.log(`Skipping large file ${file}`)
    if (!exists && !tooLarge) files.push(file)
  }

  const queue = new PQueue({ concurrency: TRANSCRIPTION_CONCURRENCY })
  const spinner = ora(`Transcribing...`).start()
  let remaining = files.length
  // Base names of the files whose transcription is currently in flight.
  const current = new Set<string>()
  const updateSpinner = () => {
    spinner.text = `Transcribing... ${remaining}/${files.length} left`
    if (current.size > 0)
      spinner.text += " [" + Array.from(current.values()).join(" ") + "]"
  }

  // Transcribe one file and insert its text; throws on any I/O or API error.
  const transcribe = async (file: string, name: string): Promise<void> => {
    const meta = await MusicMetadata.parseBuffer(await Deno.readFile(file))
    if (meta.format.duration === undefined) {
      console.log(`File seems to be corrupted: ${file}`)
      return
    }
    const result = await openaiApi.createTranscription(
      fs.createReadStream(file),
      "whisper-1"
    )
    if (result.data.text) {
      memosDb.query("INSERT INTO memos (name, text) VALUES (?, ?)", [
        name,
        result.data.text,
      ])
    }
  }

  await queue.addAll(
    files.map((file) => async () => {
      const name = path.basename(file)
      current.add(name)
      updateSpinner()
      try {
        await transcribe(file, name)
      } catch (e) {
        // A failure on one file must not reject the task: log it and move on
        // so the remaining files are still processed.
        console.error(`Failed to transcribe ${file}:`, e)
        // Only HTTP errors carry a response body; network errors do not.
        const data = (e as { response?: { data?: unknown } } | null)?.response
          ?.data
        if (data !== undefined) console.error(data)
      } finally {
        // Always update the bookkeeping, even when the task failed.
        current.delete(name)
        remaining--
        updateSpinner()
      }
    })
  )
  await queue.onIdle()
  spinner.succeed()
}
// Now categorize the memos
/**
 * Ask the chat model to assign a one-word category to a memo transcript.
 *
 * @param transcript - Text of the voice memo to classify.
 * @returns The model's reply, trimmed and lower-cased, or `undefined` when the
 *   request failed or the reply was empty. The reply is not checked against
 *   the category list here; callers must validate it.
 */
async function categorize(transcript: string): Promise<string | undefined> {
  try {
    const result = await openaiApi.createChatCompletion({
      model: "gpt-3.5-turbo",
      messages: [
        {
          role: "user",
          content: outdent`
            The following is a transcript of a voice memo recorded by Artyom [...]
            Your task is to categorize memos. The following categories are available:
            Category "task": a task to do. Examples:
            - ...
            - ...
            Category "problem": a problem that Artyom noticed in his life. Examples:
            - ...
            - ...
            For everything else, output "unknown".
            Do not output anything else. Only a single category label without quotation marks. Do not output the word "Category" either. Your output should be EXACTLY one word starting with a lower-case letter.
            The voice memo transcript follows after "---".
            ---
            ${transcript}
          `,
        },
      ],
    })
    // `choices` can be empty, and replies may carry surrounding whitespace or
    // newlines; trim so an otherwise valid one-word label is not rejected.
    const content = result.data.choices[0]?.message?.content?.trim()
    return content ? content.toLowerCase() : undefined
  } catch (e) {
    console.error(e)
    // Only HTTP errors carry a response body; network errors do not.
    const data = (e as { response?: { data?: unknown } } | null)?.response?.data
    if (data !== undefined) console.error(data)
    return undefined
  }
}
// Step 2: assign a category label to every memo that does not have one yet.
{
  const memos: [string, string][] = memosDb.query(
    "SELECT name, text FROM memos WHERE label IS NULL"
  )
  const spinner = ora(`Labelling...`).start()
  let remaining = memos.length
  // Names of the memos whose labelling request is currently in flight.
  const current = new Set<string>()
  const updateSpinner = () => {
    spinner.text = `Labelling... ${remaining}/${memos.length} left`
    if (current.size > 0)
      spinner.text += " [" + Array.from(current.values()).join(" ") + "]"
  }
  const queue = new PQueue({ concurrency: LABELLING_CONCURRENCY })
  await queue.addAll(
    memos.map(([name, text]) => async () => {
      current.add(name)
      updateSpinner()
      try {
        const label = await categorize(text)
        // Accept only a single lower-case word; anything else is left
        // unlabelled so it is picked up again on the next run.
        if (label && /^[a-z]+$/.test(label)) {
          memosDb.query("UPDATE memos SET label = ? WHERE name = ?", [
            label,
            name,
          ])
        } else {
          console.error(`${name}: unknown label ${label}`)
        }
      } catch (e) {
        // A failure on one memo must not reject the task: log it and move on.
        console.error(`${name}: labelling failed`, e)
      } finally {
        // Always update the bookkeeping, even when the task failed.
        current.delete(name)
        remaining--
        updateSpinner()
      }
    })
  )
  await queue.onIdle()
  spinner.succeed()
}
// Both steps are finished: release the SQLite handle before leaving.
memosDb.close()
// Terminate the process explicitly.
// NOTE(review): presumably needed because something keeps the event loop
// alive after the work is done — confirm before removing.
Deno.exit()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment