Skip to content

Instantly share code, notes, and snippets.

@MarketingPip
Last active February 12, 2024 16:11
Show Gist options
  • Save MarketingPip/4547daa2b745d620e63325c175c8d5f4 to your computer and use it in GitHub Desktop.
Save MarketingPip/4547daa2b745d620e63325c175c8d5f4 to your computer and use it in GitHub Desktop.
/**
 * Queries the Wikidata SPARQL endpoint for lexeme data about the given word
 * forms and hands the JSON response to `mapToSchema`.
 *
 * NOTE(review): the lexeme language is fixed to `wd:Q1860` in the query
 * (presumably English), so `langCode` only affects the language tag of the
 * word literals and the label/gloss filters — confirm before using other codes.
 * NOTE(review): `langCode` is interpolated into the query verbatim; only pass
 * trusted language codes.
 *
 * @param {string[]} words - Word forms to look up.
 * @param {string} [langCode='en'] - Language tag applied to the literals and label filters.
 * @returns {Promise<*>} Resolves to whatever `mapToSchema` returns, or to
 *   `undefined` when the request fails (the error is logged, not rethrown).
 */
async function getWordInfo(words, langCode = 'en') {
  const endpointUrl = 'https://query.wikidata.org/sparql';

  // Escape characters that would terminate or corrupt a single-quoted SPARQL
  // string literal, so words such as "don't" cannot break (or inject into) the query.
  const escapeLiteral = (word) =>
    String(word)
      .replace(/\\/g, '\\\\')
      .replace(/'/g, "\\'")
      .replace(/\n/g, '\\n')
      .replace(/\r/g, '\\r');
  const wordLiterals = words
    .map((word) => `'${escapeLiteral(word)}'@${langCode}`)
    .join(' ');

  const sparqlQuery = `
SELECT ?word ?lemma (GROUP_CONCAT(DISTINCT ?category; separator="|\$|") AS ?grammar)
(GROUP_CONCAT(DISTINCT ?forms; separator="|\$|") AS ?LexIDs)
(GROUP_CONCAT(DISTINCT ?gloss; separator="|\$|") AS ?Senses)
(GROUP_CONCAT(DISTINCT ?feat2; separator="|\$|") AS ?Uses)
(GROUP_CONCAT(DISTINCT ?usagewords; separator="|\$|") AS ?SameMeaning)
WHERE {
VALUES ?word {${wordLiterals}}
?l a ontolex:LexicalEntry ;
dct:language wd:Q1860 ;
wikibase:lemma ?lemma ;
ontolex:lexicalForm ?form.
OPTIONAL {
?l wikibase:lexicalCategory ?cat .
?cat rdfs:label ?category. FILTER(LANG(?category) = "${langCode}").
}
?l ontolex:lexicalForm ?forms .
?forms wikibase:grammaticalFeature ?features.
?features rdfs:label ?feat2. FILTER(LANG(?feat2) = "${langCode}").
?forms ontolex:representation ?usagewords .
?form ontolex:representation ?word .
?l ontolex:sense ?sense .
?sense skos:definition ?gloss.
FILTER(LANG(?gloss) = "${langCode}")
}
GROUP BY ?word ?lemma`;

  const headers = { 'Accept': 'application/sparql-results+json' };
  const fullUrl = `${endpointUrl}?query=${encodeURIComponent(sparqlQuery)}`;

  try {
    const response = await fetch(fullUrl, { headers });
    // An error response is not SPARQL JSON; fail with the HTTP status instead
    // of an opaque parse/shape error further down.
    if (!response.ok) {
      throw new Error(`SPARQL request failed: ${response.status} ${response.statusText}`);
    }
    const results = await response.json();
    return mapToSchema(results);
  } catch (error) {
    // Best-effort lookup: report the failure and resolve to undefined.
    console.error('Error fetching data:', error);
    return undefined;
  }
}
/**
 * Returns the first own enumerable key of `obj` whose value is strictly equal
 * (`===`) to `value`, or `undefined` when no key matches.
 *
 * @param {Object} obj - Object to search.
 * @param {*} value - Value to look for.
 * @returns {string|undefined} The matching key, if any.
 */
const getKeyByValue = (obj, value) => {
  for (const candidate of Object.keys(obj)) {
    if (obj[candidate] === value) {
      return candidate;
    }
  }
  return undefined;
};
// Lookup from the `penn` tag code to the `compromise` tag name reported in a
// word's `tags` object (see mapToSchema, which indexes this table by the code
// it gets from verbFormsMapping).
const compromiseMapping = {
  CC: 'Conjunction',
  CD: 'Cardinal',
  DT: 'Determiner',
  EX: 'Preposition', // Existential there
  FW: 'Expression',
  IN: 'Preposition',
  JJ: 'Adjective',
  JJR: 'Comparative',
  JJS: 'Superlative',
  MD: 'Modal',
  NN: 'Noun',
  NNS: 'Plural',
  NNP: 'Singular',
  // Was ' Plural' with a stray leading space, which never equals the
  // 'Plural' tag used for NNS.
  NNPS: 'Plural',
  POS: 'Possessive',
  PRP: 'Pronoun',
  RB: 'Adverb',
  RBR: 'Comparative',
  RBS: 'Superlative',
  RP: 'PhrasalVerb',
  PDT: 'Determiner',
  SYM: 'Expression',
  TO: 'Conjunction',
  UH: 'Expression',
  VB: 'Verb',
  VBD: 'PastTense',
  VBG: 'Gerund',
  VBN: 'Participle', // past participle
  VBP: 'PresentTense', // non-3rd person singular present
  VBZ: 'PresentTense', // 3rd person singular present
  'PRP$': 'Pronoun',
  'WP$': 'Possessive',
  WDT: 'Determiner',
  WP: 'Pronoun',
  WRB: 'Adverb',
};
// Lookup from a grammatical-feature label (the keys of a word's `forms`
// object, which come from the query's ?Uses labels) to the tag code that
// mapToSchema reports as `penn` and then uses to index compromiseMapping.
const verbFormsMapping = {
  // Verb forms
  'simple past': 'VBD',
  'past participle in english': 'VBN',
  'present participle': 'VBG',

  // Number
  plural: 'NNS',
  singular: 'NNP',

  // Person
  'third person': 'VBZ',
  'first person singular': 'VBP',
  'second person singular': 'VB',
  'third person plural': 'VBP',

  // Tense / mood
  infinitive: 'VB',
  present: 'VBP',
  past: 'VBD',
  gerund: 'VBG',

  // Degrees of comparison
  positive: 'JJ',
  comparative: 'JJR',
  superlative: 'JJS',
};
/**
 * Converts a SPARQL JSON response (one binding per ?word/?lemma group, with
 * '|$|'-joined aggregate columns) into `{ words: [...] }`, logs that object
 * and returns it.
 *
 * Each entry has: `word`, `pos` (category labels), `tags`
 * (`{ wikidata, penn, compromise }`), `lemma` (null when it equals the word,
 * ignoring case), `wikidata` (last path segment of each form URI), `forms`
 * (feature label -> representation) and `senses` (glosses).
 *
 * @param {Object} results - Parsed response; `results.results.bindings` is read.
 * @returns {{words: Object[]}} The mapped entries.
 */
function mapToSchema(results) {
  const separator = '|$|';

  // Splits an aggregate column. An empty or missing aggregate yields [] rather
  // than [''], which is what ''.split(separator) would produce.
  const splitAggregate = (binding) => {
    const raw = binding ? binding.value : '';
    return raw ? raw.split(separator) : [];
  };

  const words = results.results.bindings.map((binding) => {
    const word = binding.word.value;
    const lemma = binding.lemma.value;
    const uses = splitAggregate(binding.Uses);
    const representations = splitAggregate(binding.SameMeaning);

    // FIXME: ?Uses and ?SameMeaning are independent DISTINCT aggregates, so
    // index i of one does not necessarily belong with index i of the other.
    // Pairing them correctly needs the query to emit feature/representation
    // pairs; until then the pairing below is positional, reusing the last
    // representation when there are more labels than representations.
    const forms = {};
    uses.forEach((use, index) => {
      forms[use] = representations[index] || representations[representations.length - 1];
    });

    // Feature label whose paired representation is exactly the queried word.
    const type = getKeyByValue(forms, word);
    const penn = verbFormsMapping[type];

    return {
      word,
      pos: splitAggregate(binding.grammar),
      tags: { wikidata: type, penn, compromise: compromiseMapping[penn] },
      // Only report a lemma when it differs from the word itself.
      lemma: word.toLowerCase() !== lemma.toLowerCase() ? lemma : null,
      wikidata: splitAggregate(binding.LexIDs).map((uri) => uri.split('/').pop()),
      forms,
      senses: splitAggregate(binding.Senses),
    };
  });

  const mappedResults = { words };
  console.log(mappedResults);
  return mappedResults;
}
// Example usage: look up a few inflected and base forms with the default
// language code. This runs as soon as the script is loaded and issues a
// network request.
const wordsToQuery = ['hated', 'hate', 'going', 'go'];
// The returned promise is intentionally not awaited: getWordInfo catches and
// logs its own errors, and the mapped output is printed by mapToSchema.
getWordInfo(wordsToQuery);
@MarketingPip
Copy link
Author

@spencermountain - I don't know if you ever got a chance to peek at this, but if we solved it I think it would be useful. Plus, feel free to use it to get all the lemmas for the words you need, etc. (that part works properly).

@spencermountain
Copy link

cool way to get wordnet data - glad it's useful for you!

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment