Skip to content

Instantly share code, notes, and snippets.

@taowen
Last active March 30, 2024 02:13
Show Gist options
  • Star 4 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save taowen/3a0ee294ae60fd7e8f14f4af81edf38e to your computer and use it in GitHub Desktop.
Save taowen/3a0ee294ae60fd7e8f14f4af81edf38e to your computer and use it in GitHub Desktop.
extract text from https://ar5iv.labs.arxiv.org/
// ==UserScript==
// @name arxiv论文转markdown拷贝到剪贴板
// @description 方便粘贴到 chatgpt 进行问答
// @namespace github.com/taowen
// @match https://ar5iv.labs.arxiv.org/*
// @match https://browse.arxiv.org/html/*
// @match https://arxiv.org/html/*
// @version 1.0.1
// @author taowen
// @license MIT
// @grant GM.registerMenuCommand
// @grant GM_setClipboard
// ==/UserScript==
// Menu command: copy the selected part of the paper (or the whole page when
// nothing is actually selected) to the clipboard as Markdown-like text.
GM.registerMenuCommand("复制论文到剪贴板", () => {
  const selection = document.getSelection();
  // anchorNode/focusNode are the standard Selection properties; baseNode and
  // extentNode are non-standard aliases kept only as a fallback.
  const startNode = selection && (selection.anchorNode || selection.baseNode);
  const endNode = selection && (selection.focusNode || selection.extentNode);
  // A collapsed selection is just a caret left behind by a click, not a
  // request to copy a single element, so treat it like "nothing selected".
  const hasRange = Boolean(startNode && endNode) && !selection.isCollapsed;
  // getCommonAncestor may return null (no ancestor contains both nodes);
  // fall back to the whole document body instead of crashing in extractText.
  const targetNode = (hasRange && getCommonAncestor(startNode, endNode)) || document.body;
  const text = extractText(targetNode);
  GM_setClipboard(text);
  alert(`copied ${text.length} characters`);
});
/**
 * Find the nearest strict ancestor of `node1` that contains `node2`.
 *
 * The search starts at `node1.parentNode`, so `node1` itself is never
 * returned. Containment is tested with `contains` when `node1` has it,
 * otherwise with `compareDocumentPosition` and the 0x10 bit.
 *
 * @param {Node} node1 - Node whose ancestor chain is walked.
 * @param {Node} node2 - Node that the returned ancestor must contain.
 * @returns {Node|null} The matching ancestor, or null when none is found.
 */
function getCommonAncestor(node1, node2) {
  const hasContains = "contains" in node1;
  const probeName = hasContains ? "contains" : "compareDocumentPosition";
  // `contains` yields a boolean (true & 1 === 1); compareDocumentPosition
  // yields a bitmask in which 0x10 is the bit tested for containment.
  const mask = hasContains ? 1 : 0x10;

  for (let ancestor = node1.parentNode; ancestor; ancestor = ancestor.parentNode) {
    const outcome = ancestor[probeName](node2);
    if ((outcome & mask) === mask) {
      return ancestor;
    }
  }
  return null;
}
/**
 * Convert a DOM subtree into Markdown-flavoured plain text.
 *
 * Text nodes contribute their trimmed text, h1-h6 become `#` headings,
 * `<math>` becomes `$alttext$`, `<button>` subtrees are dropped, elements
 * whose class is exactly `ltx_note_outer` become a one-line blockquote, and
 * `p`, `li` and `ltx_listingline` elements are followed by a newline.
 *
 * @param {Node} node - Root of the subtree to convert.
 * @param {string[]} [parts] - Optional accumulator. When given, fragments
 *   are appended to it and nothing is returned.
 * @returns {string|undefined} The fragments joined by single spaces when
 *   `parts` is omitted; otherwise undefined.
 */
function extractText(node, parts) {
  const isRootCall = parts === undefined;
  const sink = isRootCall ? [] : parts;
  appendNodeText(node, sink);
  // Always hand back a string on a root call, even when `node` is a leaf
  // (text node, heading, math, button) that takes an early exit below.
  return isRootCall ? sink.join(' ') : undefined;
}

/** Append the text fragments for `node` and its descendants to `parts`. */
function appendNodeText(node, parts) {
  if (node.wholeText !== undefined) {
    parts.push(node.wholeText.trim());
    return;
  }

  const tagName = (node.tagName || '').toLowerCase();
  if (tagName === 'button') {
    return;
  }

  const heading = /^h([1-6])$/.exec(tagName);
  if (heading) {
    parts.push(`\n${'#'.repeat(Number(heading[1]))} ${node.innerText}\n`);
    return;
  }

  if (tagName === 'math') {
    // Fall back to the element's text when there is no alttext attribute
    // instead of throwing on the missing attribute.
    const altAttr = node.attributes && node.attributes.alttext;
    const formula = altAttr ? altAttr.value : (node.textContent || '');
    parts.push(`$${formula}$`);
    return;
  }

  const cssClass = node.attributes && node.attributes.class && node.attributes.class.value;
  if (cssClass === 'ltx_note_outer') {
    // Collect the note into its own buffer so it can be flattened into a
    // single blockquote line; every newline must go, not just the first.
    const noteParts = [];
    for (const child of node.childNodes) {
      appendNodeText(child, noteParts);
    }
    const noteText = noteParts.join(' ').replace(/\n/g, '');
    parts.push(`\n> ${noteText}\n\n`);
  } else {
    for (const child of node.childNodes) {
      appendNodeText(child, parts);
    }
  }

  if (tagName === 'p' || tagName === 'li' || cssClass === 'ltx_listingline') {
    parts.push('\n');
  }
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment