Last active
February 12, 2024 16:11
-
-
Save MarketingPip/4547daa2b745d620e63325c175c8d5f4 to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
/**
 * Looks up lexeme data for a list of words on the Wikidata SPARQL endpoint and
 * passes the JSON response to `mapToSchema`.
 *
 * The query restricts lexemes to `dct:language wd:Q1860` (the Wikidata item for
 * English) regardless of `langCode`; `langCode` only sets the language tag on
 * each word literal and the language filter on labels and glosses.
 *
 * @param {string[]} words - Word forms to look up.
 * @param {string} [langCode='en'] - Language tag used for the word literals and
 *   for filtering labels/definitions.
 * @returns {Promise<*>} Resolves to whatever `mapToSchema` returns, or to
 *   `undefined` when the request or the mapping fails (the error is logged).
 */
async function getWordInfo(words, langCode = 'en') {
  const endpointUrl = 'https://query.wikidata.org/sparql';
  const wordLiterals = words
    .map((word) => `'${escapeSparqlLiteral(word)}'@${langCode}`)
    .join(' ');
  const sparqlQuery = `
    SELECT ?word ?lemma (GROUP_CONCAT(DISTINCT ?category; separator="|\$|") AS ?grammar)
      (GROUP_CONCAT(DISTINCT ?forms; separator="|\$|") AS ?LexIDs)
      (GROUP_CONCAT(DISTINCT ?gloss; separator="|\$|") AS ?Senses)
      (GROUP_CONCAT(DISTINCT ?feat2; separator="|\$|") AS ?Uses)
      (GROUP_CONCAT(DISTINCT ?usagewords; separator="|\$|") AS ?SameMeaning)
    WHERE {
      VALUES ?word {${wordLiterals}}
      ?l a ontolex:LexicalEntry ;
         dct:language wd:Q1860 ;
         wikibase:lemma ?lemma ;
         ontolex:lexicalForm ?form.
      OPTIONAL {
        ?l wikibase:lexicalCategory ?cat .
        ?cat rdfs:label ?category. FILTER(LANG(?category) = "${langCode}").
      }
      ?l ontolex:lexicalForm ?forms .
      ?forms wikibase:grammaticalFeature ?features.
      ?features rdfs:label ?feat2. FILTER(LANG(?feat2) = "${langCode}").
      ?forms ontolex:representation ?usagewords .
      ?form ontolex:representation ?word .
      ?l ontolex:sense ?sense .
      ?sense skos:definition ?gloss.
      FILTER(LANG(?gloss) = "${langCode}")
    }
    GROUP BY ?word ?lemma`;
  const headers = { Accept: 'application/sparql-results+json' };
  const fullUrl = `${endpointUrl}?query=${encodeURIComponent(sparqlQuery)}`;

  try {
    const response = await fetch(fullUrl, { headers });
    // An error response has no SPARQL JSON body worth parsing; report the status instead.
    if (!response.ok) {
      throw new Error(`Wikidata query failed with HTTP status ${response.status}`);
    }
    const results = await response.json();
    return mapToSchema(results);
  } catch (error) {
    console.error('Error fetching data:', error);
    return undefined;
  }
}

/**
 * Escapes a value for use inside a single-quoted SPARQL string literal, so that
 * a word containing a quote or backslash cannot terminate the literal early.
 *
 * @param {*} value - Value to embed; it is converted to a string first.
 * @returns {string} The escaped text, without the surrounding quotes.
 */
function escapeSparqlLiteral(value) {
  const replacements = { '\\': '\\\\', "'": "\\'", '\n': '\\n', '\r': '\\r' };
  return String(value).replace(/[\\'\n\r]/g, (ch) => replacements[ch]);
}
/**
 * Finds the first own enumerable key of `obj` whose value is strictly equal to
 * `value`.
 *
 * @param {Object} obj - Object to search.
 * @param {*} value - Value to look for (compared with `===`).
 * @returns {string|undefined} The matching key, or `undefined` if none matches.
 */
const getKeyByValue = (obj, value) => Object.keys(obj).find((key) => obj[key] === value);
/**
 * Lookup table keyed by Penn-style part-of-speech tag. `mapToSchema` reads it
 * to fill the `compromise` field of a word's tags (presumably the tag names of
 * the compromise NLP library — confirm against that library's tag set).
 */
const compromiseMapping = {
  CC: 'Conjunction',
  CD: 'Cardinal',
  DT: 'Determiner',
  EX: 'Preposition', // Existential there
  FW: 'Expression',
  IN: 'Preposition',
  JJ: 'Adjective',
  JJR: 'Comparative',
  JJS: 'Superlative',
  MD: 'Modal',
  NN: 'Noun',
  NNS: 'Plural',
  NNP: 'Singular',
  NNPS: 'Plural', // was ' Plural' with a stray leading space, which never matched the NNS value
  POS: 'Possessive',
  PRP: 'Pronoun',
  RB: 'Adverb',
  RBR: 'Comparative',
  RBS: 'Superlative',
  RP: 'PhrasalVerb',
  PDT: 'Determiner',
  SYM: 'Expression',
  TO: 'Conjunction',
  UH: 'Expression',
  VB: 'Verb',
  VBD: 'PastTense',
  VBG: 'Gerund',
  VBN: 'Participle', // past participle
  VBP: 'PresentTense', // non-3rd person singular present
  VBZ: 'PresentTense', // 3rd person singular present
  PRP$: 'Pronoun',
  WP$: 'Possessive',
  WDT: 'Determiner',
  WP: 'Pronoun',
  WRB: 'Adverb',
};
/**
 * Lookup table from a grammatical-feature label to a Penn-style tag.
 * `mapToSchema` indexes it with the feature label it matched for a word, so the
 * keys must equal those labels exactly (the lookup is case-sensitive). Despite
 * the name, it also holds noun and adjective features.
 */
const verbFormsMapping = {
  // Verb forms
  'simple past': 'VBD',
  'past participle in english': 'VBN',
  'present participle': 'VBG',
  // Noun number
  plural: 'NNS',
  singular: 'NNP',
  // Person / tense features
  'third person': 'VBZ',
  'first person singular': 'VBP',
  'second person singular': 'VB',
  'third person plural': 'VBP',
  infinitive: 'VB',
  present: 'VBP',
  past: 'VBD',
  gerund: 'VBG',
  // Adjective degrees
  positive: 'JJ',
  comparative: 'JJR',
  superlative: 'JJS',
};
/**
 * Converts a SPARQL JSON response into `{ words: [...] }`, logs that structure
 * and returns it.
 *
 * Each binding is expected to carry `word`, `lemma` and the aggregated columns
 * `grammar`, `LexIDs`, `Senses`, `Uses` and `SameMeaning`, whose values are
 * lists joined with `|$|`.
 *
 * NOTE: `forms` pairs the i-th entry of `Uses` with the i-th entry of
 * `SameMeaning`. The two lists are aggregated independently, so that pairing is
 * not guaranteed to be correct; fixing it needs a change to the query that
 * produces the bindings.
 *
 * @param {{results: {bindings: Object[]}}} results - Parsed SPARQL JSON response.
 * @returns {{words: Object[]}} One entry per binding with `word`, `pos`, `tags`,
 *   `lemma` (null when it equals the word ignoring case), `wikidata`, `forms`
 *   and `senses`.
 */
function mapToSchema(results) {
  const bindings = results.results.bindings;
  const mappedResults = { words: [] };

  bindings.forEach((binding) => {
    const word = binding.word.value;
    const lemma = binding.lemma.value;
    const uses = splitAggregate(binding.Uses);
    const sameMeaning = splitAggregate(binding.SameMeaning);

    // Feature label -> representation; when there are more labels than
    // representations the last representation is reused.
    const forms = {};
    uses.forEach((use, index) => {
      forms[use] = sameMeaning[index] !== undefined
        ? sameMeaning[index]
        : sameMeaning[sameMeaning.length - 1];
    });

    // The feature label whose representation equals the queried word selects the tags.
    const type = getKeyByValue(forms, word);
    const penn = verbFormsMapping[type];

    mappedResults.words.push({
      word,
      pos: splitAggregate(binding.grammar),
      tags: { wikidata: type, penn, compromise: compromiseMapping[penn] },
      lemma: word.toLowerCase() !== lemma.toLowerCase() ? lemma : null,
      // Keep only the last path segment of each URI.
      wikidata: splitAggregate(binding.LexIDs).map((uri) => uri.split('/').pop()),
      forms,
      senses: splitAggregate(binding.Senses),
    });
  });

  console.log(mappedResults);
  return mappedResults;
}

/**
 * Splits one aggregated binding value on the `|$|` separator.
 *
 * @param {{value: string}|undefined} binding - A single column of a result row.
 * @returns {string[]} The parts, or an empty array when the column is missing
 *   or its value is the empty string.
 */
function splitAggregate(binding) {
  const raw = binding && typeof binding.value === 'string' ? binding.value : '';
  return raw === '' ? [] : raw.split('|$|');
}
// Example usage: look up two verbs together with one inflected form of each.
// getWordInfo logs request failures itself, so the returned promise is not awaited here.
const wordsToQuery = ['hated', 'hate', 'going', 'go'];
getWordInfo(wordsToQuery);
@spencermountain - I don't know if you ever got a chance to peek at this, but if we solved it I think it would be useful. Plus, feel free to use it to get the lemmas for any words you need, etc. (it works properly).
cool way to get wordnet data - glad it's useful for you!
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
@spencermountain - I originally tagged you in this to ask for help, but I think I got it figured out! (I revised the gist up top.) I would appreciate it if you could test it out and see whether there are any errors, etc.
Let me know what you think too! 😄