Skip to content

Instantly share code, notes, and snippets.

@taowen
Last active April 15, 2024 08:14
Show Gist options
  • Star 3 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save taowen/4ce9de62255ded695db106ded4aa18c1 to your computer and use it in GitHub Desktop.
Save taowen/4ce9de62255ded695db106ded4aa18c1 to your computer and use it in GitHub Desktop.
extract text from pdf

这个是油猴脚本。安装了暴力猴插件之后,点击上面这个文件的 Raw 按钮,会提示安装。

实现原理

  • pdf.js 可以提取所有的 TextItem 包括文本和包围盒
  • 根据包围盒可以大致判断一下是否换行了
  • 如果一行中包含公式,那么这一行会有很多个 TextItem,这样的行称为 complex 行
  • 多个连续的 complex 行组成一个 complex 块
  • 如果有 Claude 3 Haiku 的账号,会对 complex 块做一次基于图片的 OCR,来清洗嘈杂的带公式的文本
// ==UserScript==
// @name 拷贝 PDF 中的文本
// @description 方便粘贴到 chatgpt 进行问答
// @namespace github.com/taowen
// @match *://*/*pdf*
// @version 1.0.0
// @author taowen
// @license MIT
// @grant GM.registerMenuCommand
// @grant GM_setClipboard
// @grant GM.getValue
// @grant GM.setValue
// @grant GM.xmlHttpRequest
// ==/UserScript==
// Menu command: load the PDF at the current URL with pdf.js, rebuild its text
// lines from the individual text items, and copy the result to the clipboard.
// Lines whose first item is taller than 11 units are emitted as "## " headings.
GM.registerMenuCommand("复制 Pdf 为 MarkDown", async () => {
    try {
        const PDFJS = await import('https://unpkg.com/pdfjs-dist/build/pdf.min.mjs');
        PDFJS.GlobalWorkerOptions.workerSrc = 'https://unpkg.com/pdfjs-dist/build/pdf.worker.min.mjs';
        const doc = await PDFJS.getDocument(window.location.href).promise;
        const lines = [];
        // pdf.js page numbers are 1-based.
        for (let pageNumber = 1; pageNumber <= doc.numPages; pageNumber++) {
            const page = await doc.getPage(pageNumber);
            const textContent = await page.getTextContent();
            let currentLineY = 0;
            let currentLineText = '';
            let currentLineHeight = 0;
            for (const item of textContent.items) {
                // Zero-height items are skipped entirely.
                if (item.height === 0) {
                    continue;
                }
                const y = item.transform[5];
                // An item opens a new line only when its y differs from the current
                // line's AND it is either longer than 4 characters or sits more than
                // 11.5 units below. Short items at a slightly different y (presumably
                // sub/superscripts) are appended to the current line instead.
                const startsNewLine = y !== currentLineY && (item.str.length > 4 || currentLineY - y > 11.5);
                if (!startsNewLine) {
                    currentLineText += item.str;
                    continue;
                }
                if (currentLineText) {
                    lines.push(currentLineHeight > 11 ? '## ' + currentLineText : currentLineText);
                }
                currentLineText = item.str;
                currentLineY = y;
                currentLineHeight = item.height;
            }
            // The last line of each page is flushed as plain text (never as a heading).
            lines.push(currentLineText);
        }
        const allText = lines.join('\n');
        GM_setClipboard(allText);
        alert('copied ' + allText.length + ' characters');
    } catch (error) {
        // Without this the failure would only be an unhandled rejection and the
        // menu command would appear to do nothing.
        console.error('failed to copy pdf text', error);
        alert('failed to copy pdf text: ' + (error?.message ?? error));
    }
});
/**
 * Asks Claude 3 Haiku to transcribe a PNG image to Markdown with LaTeX.
 *
 * The endpoint and API key are empty by default: edit the defaults below in
 * your local copy of the user script (or pass them explicitly) to enable OCR.
 *
 * @param {string} imageBase64 - PNG data URL (`data:image/png;base64,...`).
 * @param {string} referenceText - Raw text extracted from the same region, sent as a hint.
 * @param {{apiUrl?: string, apiKey?: string}} [options] - Messages endpoint URL and API key.
 * @returns {Promise<string>} Text of the first content item of the response.
 * @throws {Error} Synchronously, when credentials are missing or the image is not a PNG data URL.
 */
function ocr(imageBase64, referenceText, { apiUrl = '', apiKey = '' } = {}) {
    if (!apiUrl || !apiKey) {
        alert('请本地修改 user script,填入 Claude 账号');
        throw new Error('请本地修改 user script,填入 Claude 账号');
    }
    const pngPrefix = 'data:image/png;base64,';
    if (!imageBase64.startsWith(pngPrefix)) {
        throw new Error('expect png');
    }
    const imageData = imageBase64.substring(pngPrefix.length);
    return new Promise((resolve, reject) => {
        GM.xmlHttpRequest({
            method: 'POST',
            url: apiUrl,
            headers: {
                "x-api-key": apiKey,
                "anthropic-version": "2023-06-01",
                "Content-Type": "application/json"
            },
            data: JSON.stringify({
                model: "claude-3-haiku-20240307",
                max_tokens: 4000,
                temperature: 0,
                messages: [{
                    role: 'user', content: [
                        {"type": "image", "source": {"type": "base64", "media_type": "image/png", "data": imageData}},
                        {"type": "text", "text": `<referenceText>${referenceText}</referenceText>\n` + 'Markdown latex can have single $ or double $$. Transcribe this paper to markdown with latex exactly.'}
                    ]}, {
                    // Prefilled assistant turn so the reply continues inside a markdown fence.
                    role: 'assistant',
                    content: [{ type: 'text', text: 'Here is the text transcribed to Markdown:\n```markdown'}]
                }],
            }),
            onload(response) {
                // An exception thrown inside this callback would not settle the
                // promise, so malformed JSON or a payload without `content`
                // must be turned into a rejection explicitly.
                try {
                    const result = JSON.parse(response.responseText);
                    resolve(result['content'][0]['text']);
                } catch (error) {
                    console.error('failed to ocr', response);
                    reject(new Error('failed to ocr', { cause: error }));
                }
            },
            onerror(response) {
                console.error('failed to ocr', response);
                reject(new Error('failed to ocr'));
            }
        });
    });
}
/**
 * Debug helper: prints an image into the console by logging a single space
 * styled with the image as its CSS background.
 *
 * @param {string} url - Image URL used both to load the image and as the background.
 * @param {number} [size=50] - Padding factor; padding is dimension / 100 * size pixels.
 */
function logImage(url, size = 50) {
    const preview = new Image();
    preview.src = url;
    preview.onload = () => {
        // Padding is what gives the otherwise 1px-font space a visible area.
        const verticalPadding = preview.height / 100 * size;
        const horizontalPadding = preview.width / 100 * size;
        const css = `font-size: 1px; padding: ${verticalPadding}px ${horizontalPadding}px; background: url(${url}) no-repeat; background-size: contain;`;
        console.log('%c ', css);
    };
}
/**
 * Reads a Blob with FileReader and returns it as a data URL.
 *
 * @param {Blob} blob - Blob to read.
 * @returns {Promise<string>} The data URL produced by `readAsDataURL`.
 *   Rejects when the read fails, so callers never wait on a promise that cannot settle.
 */
function blobToBase64(blob) {
    const fileReader = new FileReader();
    return new Promise((resolve, reject) => {
        fileReader.onload = () => {
            resolve(fileReader.result);
        };
        fileReader.onerror = () => {
            reject(fileReader.error ?? new Error('failed to read blob'));
        };
        fileReader.readAsDataURL(blob);
    });
}
/**
 * Copies a rectangular region of an image onto an offscreen canvas and
 * returns that region as a data URL.
 *
 * @param {CanvasImageSource} img - Source passed to `drawImage`.
 * @param {{x: number, y: number, width: number, height: number}} boundingBox - Region in source pixels.
 * @returns {Promise<string>} Data URL of the cropped region (via `blobToBase64`).
 */
async function cropImage(img, boundingBox) {
    const { x: left, y: top, width: cropWidth, height: cropHeight } = boundingBox;
    const canvas = new OffscreenCanvas(cropWidth, cropHeight);
    const context = canvas.getContext('2d');
    // Source rectangle -> same-sized destination at the canvas origin (no scaling).
    context.drawImage(img, left, top, cropWidth, cropHeight, 0, 0, cropWidth, cropHeight);
    const blob = await canvas.convertToBlob();
    return blobToBase64(blob);
}
/**
 * Appends the text of a block of "complex" lines to `result`.
 *
 * Blocks of at most 4 lines are appended verbatim. Longer blocks are cropped
 * out of the page image (union of the rects of every line except the last),
 * transcribed with `ocr`, and the transcription is appended followed by the
 * raw text of the last line.
 *
 * @param {string[]} result - Output array, mutated in place.
 * @param {CanvasImageSource} img - Rendered page handed to `cropImage`.
 * @param {Array<[boolean, string, Array<{x: number, y: number, width: number, height: number}>]>} complexBlock
 *   Lines as `[isComplex, text, rects]` tuples.
 * @returns {Promise<void>}
 */
async function processComplexBlock(result, img, complexBlock) {
    if (complexBlock.length <= 4) {
        // Too short to be worth an OCR round trip: keep the extracted text.
        for (const entry of complexBlock) {
            result.push(entry[1]);
        }
        return;
    }

    // Every line except the last one contributes to the crop and the hint text.
    const ocrLines = complexBlock.slice(0, complexBlock.length - 1);
    const referenceText = ocrLines.map((entry) => entry[1] + '\n').join('');

    const bounds = { left: Infinity, top: Infinity, right: -Infinity, bottom: -Infinity };
    for (const entry of ocrLines) {
        for (const { x, y, width, height } of entry[2]) {
            bounds.left = Math.min(bounds.left, x);
            bounds.top = Math.min(bounds.top, y);
            bounds.right = Math.max(bounds.right, x + width);
            bounds.bottom = Math.max(bounds.bottom, y + height);
        }
    }

    const patch = await cropImage(img, {
        x: bounds.left,
        y: bounds.top,
        width: bounds.right - bounds.left,
        height: bounds.bottom - bounds.top,
    });
    logImage(patch);
    console.log(referenceText);

    const transcription = await ocr(patch, referenceText);
    // Drop everything from the last code fence onwards, when one is present.
    const fenceStart = transcription.lastIndexOf('```');
    const cleaned = (fenceStart !== -1 ? transcription.substring(0, fenceStart) : transcription).trim();
    console.log(cleaned);

    result.push(cleaned);
    result.push(complexBlock[complexBlock.length - 1][1]);
}
// Menu command: like the plain copy command, but each page is also rendered to
// a canvas so that runs of "complex" lines (lines made of many text items,
// typically formulas) can be cropped out and transcribed via processComplexBlock.
GM.registerMenuCommand("复制 Pdf 为 MarkDown (用 haiku 清洗)", async () => {
    // Render scale; text-item coordinates are multiplied by the same factor
    // to map them onto the rendered canvas.
    const RENDER_SCALE = 4;
    const PDFJS = await import('https://unpkg.com/pdfjs-dist/build/pdf.min.mjs');
    PDFJS.GlobalWorkerOptions.workerSrc = 'https://unpkg.com/pdfjs-dist/build/pdf.worker.min.mjs';
    const doc = await PDFJS.getDocument(window.location.href).promise;
    const result = [];
    // pdf.js page numbers are 1-based.
    for (let pageNumber = 1; pageNumber <= doc.numPages; pageNumber++) {
        // Each entry is [isComplex, text, rects].
        const lines = [];
        const page = await doc.getPage(pageNumber);
        const textContent = await page.getTextContent();
        const viewport = page.getViewport({ scale: RENDER_SCALE });
        const canvas = document.createElement('canvas');
        canvas.width = viewport.width;
        canvas.height = viewport.height;
        await page.render({
            canvasContext: canvas.getContext('2d'),
            viewport,
        }).promise;
        // The canvas itself is used as the crop source below: drawImage accepts
        // a canvas, so there is no need to encode the page to JPEG and decode
        // it again (which was slower and lossy for the OCR crops).

        let currentLineY = 0;
        let currentLineText = '';
        let currentLineHeight = 0;
        let currentLineRects = [];
        for (const item of textContent.items) {
            // Zero-height items are skipped entirely.
            if (item.height === 0) {
                continue;
            }
            const y = item.transform[5];
            // Same line-break heuristic as the plain copy command: a different y
            // only starts a new line for items longer than 4 characters or for a
            // drop of more than 11.5 units.
            if (y !== currentLineY && (item.str.length > 4 || currentLineY - y > 11.5)) {
                if (currentLineText) {
                    // A line is "complex" when it is built from more than 4 items,
                    // unless it matches one of the known false-positive patterns
                    // (spaced-out "i.e ." / "e.g .", arrows, or only letters,
                    // bullets, dots, commas and whitespace).
                    const isComplex = currentLineRects.length > 4 && !currentLineText.includes('i.e .') && !currentLineText.includes('e.g .') && !currentLineText.includes('↓') && !currentLineText.includes('↑') && !/^[A-Za-z•.,\s]+$/.test(currentLineText);
                    if (!isComplex && currentLineHeight > 11) {
                        lines.push([false, '## ' + currentLineText, currentLineRects]);
                    } else {
                        if (isComplex && lines.length) {
                            // Pull the preceding line into the complex block as well.
                            lines[lines.length - 1][0] = true;
                        }
                        lines.push([isComplex, currentLineText, currentLineRects]);
                    }
                }
                currentLineText = '';
                currentLineY = y;
                currentLineHeight = item.height;
                currentLineRects = [];
            }
            currentLineText = currentLineText + ' ' + item.str;
            // Rect in canvas pixels; canvas y grows downwards, hence the flip
            // against viewport.height. The 2.7 / 4.4 height factors are kept
            // as tuned in the original (NOTE(review): presumably padding around
            // the glyph box - confirm before changing).
            const rect = {
                x: item.transform[4] * RENDER_SCALE,
                y: viewport.height - item.transform[5] * RENDER_SCALE - item.height * 2.7,
                width: item.width * RENDER_SCALE,
                height: item.height * 4.4,
            };
            currentLineRects.push(rect);
        }
        // Final line of the page, always simple; being last it also forces any
        // open complex block to be flushed by the loop below.
        lines.push([false, currentLineText, [{ x: 0, y: 0, width: 0, height: 0 }]]);

        let complexBlock = [];
        let continuousSimple = 0;
        for (let lineIndex = 0; lineIndex < lines.length; lineIndex++) {
            const line = lines[lineIndex];
            const [isComplex, text, rects] = line;
            if (isComplex) {
                continuousSimple = 0;
            } else {
                continuousSimple += 1;
            }
            if (complexBlock.length === 0) {
                if (isComplex) {
                    complexBlock = [line];
                } else {
                    result.push(text);
                }
            } else if (continuousSimple < 3 && lineIndex < lines.length - 1 && Math.abs(rects[0].x - complexBlock[0][2][0].x) < 800) {
                // Still inside the block: fewer than 3 simple lines in a row, not
                // the page's last line, and starting within 800px of the block's
                // first rect horizontally.
                complexBlock.push(line);
            } else {
                await processComplexBlock(result, canvas, complexBlock);
                complexBlock = [];
                result.push(text);
            }
        }
    }
    const allText = result.join('\n');
    GM_setClipboard(allText);
    alert('copied ' + allText.length + ' characters');
});
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment