这是一个油猴脚本。安装暴力猴插件之后,点击上面这个文件的 Raw 按钮,就会提示安装。
- pdf.js 可以提取所有的 TextItem 包括文本和包围盒
- 根据包围盒可以大致判断一下是否换行了
- 如果一行中包含了公式,那么这一行会有很多个 TextItem,这样的行称为 complex 行
- 多个连续的 complex 行会合并成一个 complex 块
- 如果配置了 Claude 3 Haiku 的账号,会对 complex 块做一次基于图片的 OCR,用来清洗嘈杂的带公式的文本
// ==UserScript==
// @name 拷贝 PDF 中的文本
// @description 方便粘贴到 chatgpt 进行问答
// @namespace github.com/taowen
// @match *://*/*pdf*
// @version 1.0.0
// @author taowen
// @license MIT
// @grant GM.registerMenuCommand
// @grant GM_setClipboard
// @grant GM.getValue
// @grant GM.setValue
// @grant GM.xmlHttpRequest
// ==/UserScript==
// Menu command: load the PDF at the current URL with pdf.js, regroup its text
// items into lines, and copy the joined text to the clipboard.
GM.registerMenuCommand("复制 Pdf 为 MarkDown", async () => {
    // A line whose first item is taller than this is emitted as a "## " heading.
    const HEADING_MIN_HEIGHT = 11;
    // Items with at most this many characters only start a new line when they
    // sit clearly below the current one (see LINE_BREAK_MIN_DROP); otherwise
    // they are appended to the current line.
    const SHORT_ITEM_MAX_LENGTH = 4;
    const LINE_BREAK_MIN_DROP = 11.5;

    const lines = [];

    // Append one finished line, applying the heading rule. Empty text is skipped
    // so that blank pages do not add stray empty lines.
    const flushLine = (text, height) => {
        if (!text) {
            return;
        }
        lines.push(height > HEADING_MIN_HEIGHT ? '## ' + text : text);
    };

    try {
        const PDFJS = await import('https://unpkg.com/pdfjs-dist/build/pdf.min.mjs');
        // Load the worker from the same release as the library that was just
        // imported, so the two cannot resolve to different versions.
        const workerPackage = PDFJS.version ? `pdfjs-dist@${PDFJS.version}` : 'pdfjs-dist';
        PDFJS.GlobalWorkerOptions.workerSrc = `https://unpkg.com/${workerPackage}/build/pdf.worker.min.mjs`;
        const doc = await PDFJS.getDocument(window.location.href).promise;

        for (let pageNumber = 1; pageNumber <= doc.numPages; pageNumber++) {
            const page = await doc.getPage(pageNumber);
            const textContent = await page.getTextContent();
            let currentLineY = 0;
            let currentLineText = '';
            let currentLineHeight = 0;

            for (const item of textContent.items) {
                if (item.height === 0) {
                    continue;
                }
                // transform[5] is used as the vertical position of the item.
                const y = item.transform[5];
                const startsNewLine = y !== currentLineY
                    && (item.str.length > SHORT_ITEM_MAX_LENGTH || currentLineY - y > LINE_BREAK_MIN_DROP);
                if (startsNewLine) {
                    flushLine(currentLineText, currentLineHeight);
                    currentLineText = item.str;
                    currentLineY = y;
                    currentLineHeight = item.height;
                } else {
                    currentLineText += item.str;
                }
            }
            // The last line of the page goes through the same heading/empty
            // handling as every other line.
            flushLine(currentLineText, currentLineHeight);
        }

        const allText = lines.join('\n');
        GM_setClipboard(allText);
        alert('copied ' + allText.length + ' characters');
    } catch (err) {
        // Without this the failure would only surface as an unhandled rejection.
        console.error('failed to copy pdf text', err);
        alert('failed to copy pdf text: ' + (err?.message ?? err));
    }
});
/**
 * Sends a PNG patch plus the noisy extracted text to a Claude model and
 * resolves with the Markdown/LaTeX transcription returned by the model.
 *
 * The endpoint and API key are intentionally blank: fill in `apiUrl` and
 * `apiKey` below in your local copy of the script. While either is empty the
 * function alerts and throws synchronously.
 *
 * @param {string} imageBase64 - PNG data URL (`data:image/png;base64,...`).
 * @param {string} referenceText - Text extracted from the same region, passed to the model as a hint.
 * @returns {Promise<string>} Text of the first content item of the response.
 * @throws {Error} When credentials are missing or the image is not a PNG data URL.
 */
function ocr(imageBase64, referenceText) {
    // NOTE(review): the headers suggest the Anthropic Messages API endpoint — confirm before filling in.
    const apiUrl = '';
    const apiKey = '';
    if (!apiUrl || !apiKey) {
        alert('请本地修改 user script,填入 Claude 账号');
        throw new Error('请本地修改 user script,填入 Claude 账号');
    }

    const pngPrefix = 'data:image/png;base64,';
    if (!imageBase64.startsWith(pngPrefix)) {
        throw new Error('expect png');
    }
    // The request carries the bare base64 payload, not the data URL.
    const imageData = imageBase64.substring(pngPrefix.length);

    return new Promise((resolve, reject) => {
        const fail = (response, cause) => {
            console.error('failed to ocr', response);
            reject(new Error('failed to ocr', { cause }));
        };
        GM.xmlHttpRequest({
            method: 'POST',
            url: apiUrl,
            headers: {
                "x-api-key": apiKey,
                "anthropic-version": "2023-06-01",
                "Content-Type": "application/json"
            },
            data: JSON.stringify({
                model: "claude-3-haiku-20240307",
                max_tokens: 4000,
                temperature: 0,
                messages: [{
                    role: 'user', content: [
                        {"type": "image", "source": {"type": "base64", "media_type": "image/png", "data": imageData}},
                        {"type": "text", "text": `<referenceText>${referenceText}</referenceText>\n` + 'Markdown latex can have single $ or double $$. Transcribe this paper to markdown with latex exactly.'}
                    ]}, {
                    // Prefilled assistant turn: the reply continues inside an open ```markdown fence.
                    role: 'assistant',
                    content: [{ type: 'text', text: 'Here is the text transcribed to Markdown:\n```markdown'}]
                }],
            }),
            onload(response) {
                // An exception thrown here would escape the promise and leave
                // the caller waiting forever, so every failure is routed to reject.
                try {
                    const result = JSON.parse(response.responseText);
                    const markdown = result['content'][0]['text'];
                    resolve(markdown);
                } catch (err) {
                    fail(response, err);
                }
            },
            onerror(response) {
                fail(response);
            },
            ontimeout(response) {
                fail(response);
            },
            onabort(response) {
                fail(response);
            }
        });
    });
}
/**
 * Debug helper: prints an image to the console by styling a single space
 * with the image as a CSS background.
 *
 * @param {string} url - Image URL (a data URL in this script).
 * @param {number} [size=50] - Display size as a percentage of the image's own dimensions.
 */
function logImage(url, size = 50) {
    const image = new Image();
    // Handlers are attached before `src` is assigned so the load cannot be missed.
    image.onload = () => {
        const style = [
            'font-size: 1px;',
            // Padding stretches the one-character box to the scaled image size.
            `padding: ${image.height / 100 * size}px ${image.width / 100 * size}px;`,
            `background: url(${url}) no-repeat;`,
            'background-size: contain;'
        ].join(' ');
        console.log('%c ', style);
    };
    image.onerror = () => {
        console.warn('logImage: failed to load image');
    };
    image.src = url;
}
/**
 * Reads a Blob and resolves with its content as a data URL.
 *
 * @param {Blob} blob - Blob to encode.
 * @returns {Promise<string>} The `FileReader.readAsDataURL` result.
 *   Rejects when the read fails, instead of staying pending forever.
 */
function blobToBase64(blob) {
    return new Promise((resolve, reject) => {
        const reader = new FileReader();
        reader.onload = () => {
            resolve(reader.result);
        };
        reader.onerror = () => {
            reject(reader.error ?? new Error('failed to read blob'));
        };
        reader.readAsDataURL(blob);
    });
}
/**
 * Cuts a rectangular region out of an image and returns it as a data URL
 * (the default `convertToBlob` output, read back via `blobToBase64`).
 *
 * @param {CanvasImageSource} img - Source image.
 * @param {{x: number, y: number, width: number, height: number}} boundingBox - Region in source-image pixels.
 * @returns {Promise<string>} Data URL of the cropped region.
 * @throws {RangeError} When the box is not finite or has no area.
 */
async function cropImage(img, boundingBox) {
    const { x, y, width, height } = boundingBox;
    const isUsable = [x, y, width, height].every(Number.isFinite) && width > 0 && height > 0;
    if (!isUsable) {
        throw new RangeError(`invalid bounding box: ${JSON.stringify({ x, y, width, height })}`);
    }
    // Canvas dimensions are integers; rounding up keeps the right/bottom edge
    // of a fractional box from being cut off.
    const offscreen = new OffscreenCanvas(Math.ceil(width), Math.ceil(height));
    const ctx = offscreen.getContext('2d');
    ctx.drawImage(img, x, y, width, height, 0, 0, width, height);
    const blob = await offscreen.convertToBlob();
    return blobToBase64(blob);
}
/**
 * Appends the text of one "complex" block to `result`.
 *
 * Blocks of more than 4 lines are cleaned up via OCR: the union of the
 * rectangles of every line except the last is cropped from the page image and
 * sent to `ocr` with the raw text as a reference; the last line is appended
 * unchanged. Shorter blocks, and blocks that carry no rectangles to crop, are
 * appended as raw text.
 *
 * @param {string[]} result - Output lines; mutated in place.
 * @param {CanvasImageSource} img - Rendered page image the rectangles refer to.
 * @param {Array<[boolean, string, Array<{x: number, y: number, width: number, height: number}>]>} complexBlock
 *   Lines as `[isComplex, text, rects]` tuples.
 * @returns {Promise<void>}
 */
async function processComplexBlock(result, img, complexBlock) {
    const pushRawLines = () => {
        for (const line of complexBlock) {
            result.push(line[1]);
        }
    };

    if (complexBlock.length <= 4) {
        pushRawLines();
        return;
    }

    // The final line of the block is excluded from the OCR patch and emitted as-is below.
    const ocrLines = complexBlock.slice(0, -1);
    let xMin = Infinity;
    let yMin = Infinity;
    let xMax = -Infinity;
    let yMax = -Infinity;
    let referenceText = '';
    for (const line of ocrLines) {
        referenceText = referenceText + line[1] + '\n';
        for (const { x, y, width, height } of line[2]) {
            xMin = Math.min(xMin, x);
            yMin = Math.min(yMin, y);
            xMax = Math.max(xMax, x + width);
            yMax = Math.max(yMax, y + height);
        }
    }

    // No rectangles were collected, so there is nothing to crop; keep the raw
    // text instead of requesting a crop with an infinite box.
    const hasBox = [xMin, yMin, xMax, yMax].every(Number.isFinite);
    if (!hasBox) {
        pushRawLines();
        return;
    }

    const patch = await cropImage(img, {
        x: xMin,
        y: yMin,
        width: xMax - xMin,
        height: yMax - yMin
    });
    logImage(patch);
    console.log(referenceText);
    let patchText = await ocr(patch, referenceText);
    // Drop everything from the last ``` on (the closing fence of the reply, if present).
    const end = patchText.lastIndexOf('```');
    if (end !== -1) {
        patchText = patchText.substring(0, end);
    }
    patchText = patchText.trim();
    console.log(patchText);
    result.push(patchText);
    result.push(complexBlock[complexBlock.length - 1][1]);
}
// Menu command: like the plain copy command, but each page is also rendered to
// an image so that runs of "complex" lines (lines made of many text items,
// typically formulas) can be handed to processComplexBlock for OCR-based cleaning.
GM.registerMenuCommand("复制 Pdf 为 MarkDown (用 haiku 清洗)", async () => {
    const PDFJS = await import('https://unpkg.com/pdfjs-dist/build/pdf.min.mjs');
    // Load the worker from the same release as the library that was just imported.
    const workerPackage = PDFJS.version ? `pdfjs-dist@${PDFJS.version}` : 'pdfjs-dist';
    PDFJS.GlobalWorkerOptions.workerSrc = `https://unpkg.com/${workerPackage}/build/pdf.worker.min.mjs`;
    const doc = await PDFJS.getDocument(window.location.href).promise;
    const result = [];
    for (let pageNumber = 1; pageNumber < doc.numPages + 1; pageNumber++) {
        // Entries are [isComplex, text, rects] tuples.
        const lines = [];
        const page = await doc.getPage(pageNumber);
        const textContent = await page.getTextContent();

        // Render the page at 4x; the rectangle math below uses the same factor.
        const viewport = page.getViewport({ scale: 4 });
        const canvas = document.createElement('canvas');
        canvas.width = viewport.width;
        canvas.height = viewport.height;
        await page.render({
            canvasContext: canvas.getContext('2d'),
            viewport,
        }).promise;
        const imgUrl = canvas.toDataURL('image/jpeg');
        const img = new Image();
        await new Promise((resolve, reject) => {
            img.onload = () => {
                resolve(img);
            };
            // Reject instead of hanging if the rendered page cannot be decoded.
            img.onerror = () => {
                reject(new Error('failed to load rendered page image'));
            };
            img.src = imgUrl;
        });

        let currentLineY = 0;
        let currentLineText = '';
        let currentLineHeight = 0;
        let currentLineRects = [];
        for (const item of textContent.items) {
            if (item.height === 0) {
                continue;
            }
            const y = item.transform[5];
            // Short items (<= 4 chars) only start a new line when they sit more
            // than 11.5 units below the current one; otherwise they join the current line.
            if (y !== currentLineY && (item.str.length > 4 || currentLineY - y > 11.5)) {
                if (currentLineText) {
                    // A line is "complex" when it has more than 4 items and does
                    // not match any of the plain-text exclusions below.
                    const isComplex = currentLineRects.length > 4 && !currentLineText.includes('i.e .') && !currentLineText.includes('e.g .') && !currentLineText.includes('↓') && !currentLineText.includes('↑') && !/^[A-Za-z•.,\s]+$/.test(currentLineText);
                    if (!isComplex && currentLineHeight > 11) {
                        lines.push([false, '## ' + currentLineText, currentLineRects]);
                    } else {
                        // The line preceding a complex line is marked complex too.
                        if (isComplex && lines.length) {
                            lines[lines.length - 1][0] = true;
                        }
                        lines.push([isComplex, currentLineText, currentLineRects]);
                    }
                }
                currentLineText = '';
                currentLineY = y;
                currentLineHeight = item.height;
                currentLineRects = [];
            }
            currentLineText = currentLineText + ' ' + item.str;
            // Item box in rendered-image pixels: positions and width are scaled by 4
            // and the y axis is flipped against the viewport height. The 2.7 and 4.4
            // height factors are kept as-is from the original; presumably hand-tuned.
            const rect = { x: item.transform[4] * 4, y: viewport.height - item.transform[5] * 4 - item.height * 2.7, width: item.width * 4, height: item.height * 4.4 };
            currentLineRects.push(rect);
        }
        // The trailing line of the page is always pushed as a simple line with a placeholder rectangle.
        lines.push([false, currentLineText, [{ x: 0, y: 0, width: 0, height: 0 }]]);

        let complexBlock = [];
        let continuousSimple = 0;
        for (let lineIndex = 0; lineIndex < lines.length; lineIndex++) {
            const line = lines[lineIndex];
            const [isComplex, text, rects] = line;
            if (isComplex) {
                continuousSimple = 0;
            } else {
                continuousSimple += 1;
            }
            if (complexBlock.length === 0) {
                if (isComplex) {
                    complexBlock = [line];
                } else {
                    result.push(text);
                }
            } else {
                // An open block keeps growing while fewer than 3 simple lines have
                // been seen in a row, this is not the page's last line, and the
                // line starts within 800 image pixels (horizontally) of the block's first rectangle.
                if (continuousSimple < 3 && lineIndex < lines.length - 1 && Math.abs(rects[0].x - complexBlock[0][2][0].x) < 800) {
                    complexBlock.push(line);
                } else {
                    await processComplexBlock(result, img, complexBlock);
                    complexBlock = [];
                    result.push(text);
                }
            }
        }
    }
    const allText = result.join('\n');
    GM_setClipboard(allText);
    alert('copied ' + allText.length + ' characters');
});