Skip to content

Instantly share code, notes, and snippets.

@MarketingPip
Last active February 12, 2024 16:11
Show Gist options
  • Save MarketingPip/4547daa2b745d620e63325c175c8d5f4 to your computer and use it in GitHub Desktop.
Save MarketingPip/4547daa2b745d620e63325c175c8d5f4 to your computer and use it in GitHub Desktop.
/**
 * Escapes text so it can be embedded inside a single-quoted SPARQL literal.
 *
 * @param {string} text - Raw text supplied by the caller.
 * @returns {string} The text with backslashes, single quotes and line breaks escaped.
 */
function escapeSparqlLiteral(text) {
  return String(text)
    .replace(/\\/g, '\\\\')
    .replace(/'/g, "\\'")
    .replace(/\n/g, '\\n')
    .replace(/\r/g, '\\r');
}

/**
 * Builds the SPARQL query that looks up lexeme data for the given word forms.
 *
 * @param {string[]} words - Word forms to look up.
 * @param {string} langCode - Already-validated language tag.
 * @returns {string} The SPARQL query text.
 */
function buildWordInfoQuery(words, langCode) {
  const values = words
    .map((word) => `'${escapeSparqlLiteral(word)}'@${langCode}`)
    .join(' ');

  // The default ('en') keeps the original fixed English item (wd:Q1860) so the
  // query text is unchanged. Any other code is resolved through the language
  // item's ISO 639-1 code (wdt:P218) instead of silently searching English
  // lexemes.
  const isDefaultEnglish = langCode === 'en';
  const languageTerm = isDefaultEnglish ? 'wd:Q1860' : '?language';
  const languageLookup = isDefaultEnglish
    ? ''
    : `\n?language wdt:P218 '${langCode.split('-')[0].toLowerCase()}' .`;

  return `
SELECT ?word ?lemma (GROUP_CONCAT(DISTINCT ?category; separator="|\$|") AS ?grammar)
(GROUP_CONCAT(DISTINCT ?forms; separator="|\$|") AS ?LexIDs)
(GROUP_CONCAT(DISTINCT ?gloss; separator="|\$|") AS ?Senses)
(GROUP_CONCAT(DISTINCT ?feat2; separator="|\$|") AS ?Uses)
(GROUP_CONCAT(DISTINCT ?usagewords; separator="|\$|") AS ?SameMeaning)
WHERE {
VALUES ?word {${values}}
?l a ontolex:LexicalEntry ;
dct:language ${languageTerm} ;
wikibase:lemma ?lemma ;
ontolex:lexicalForm ?form.${languageLookup}
OPTIONAL {
?l wikibase:lexicalCategory ?cat .
?cat rdfs:label ?category. FILTER(LANG(?category) = "${langCode}").
}
?l ontolex:lexicalForm ?forms .
?forms wikibase:grammaticalFeature ?features.
?features rdfs:label ?feat2. FILTER(LANG(?feat2) = "${langCode}").
?forms ontolex:representation ?usagewords .
?form ontolex:representation ?word .
?l ontolex:sense ?sense .
?sense skos:definition ?gloss.
FILTER(LANG(?gloss) = "${langCode}")
}
GROUP BY ?word ?lemma`;
}

/**
 * Queries the Wikidata SPARQL endpoint for lexeme data about the given word
 * forms and hands the JSON response to `mapToSchema`.
 *
 * Network, HTTP and mapping failures are logged with `console.error` and the
 * promise then resolves to `undefined` (best-effort lookup).
 *
 * @param {string[]} words - Word forms to look up.
 * @param {string} [langCode='en'] - Language tag used for the word literals
 *   and for filtering labels and glosses.
 * @returns {Promise<*>} Whatever `mapToSchema` returns, or `undefined` when
 *   the lookup failed.
 * @throws {TypeError} (as a rejection) if `words` is not an array or
 *   `langCode` is not a plain language tag.
 */
async function getWordInfo(words, langCode = 'en') {
  if (!Array.isArray(words)) {
    throw new TypeError('words must be an array of strings');
  }
  // langCode is spliced into the query verbatim, so only a plain language tag
  // is accepted; anything else could alter the query.
  if (typeof langCode !== 'string' || !/^[A-Za-z]{2,3}(-[A-Za-z0-9]{1,8})*$/.test(langCode)) {
    throw new TypeError(`Invalid language code: ${String(langCode)}`);
  }
  if (words.length === 0) {
    // Nothing to look up: skip the network round trip.
    return mapToSchema({ results: { bindings: [] } });
  }

  const endpointUrl = 'https://query.wikidata.org/sparql';
  const sparqlQuery = buildWordInfoQuery(words, langCode);
  const headers = { Accept: 'application/sparql-results+json' };
  const fullUrl = `${endpointUrl}?query=${encodeURIComponent(sparqlQuery)}`;

  try {
    const response = await fetch(fullUrl, { headers });
    if (!response.ok) {
      // Error responses are not SPARQL JSON; report the status instead of a
      // confusing JSON parse failure.
      throw new Error(`Wikidata query failed: ${response.status} ${response.statusText}`);
    }
    const results = await response.json();
    return mapToSchema(results);
  } catch (error) {
    console.error('Error fetching data:', error);
    return undefined;
  }
}
/**
 * Returns the first own enumerable key of `obj` whose value is strictly equal
 * (`===`) to `value`, or `undefined` when no key matches.
 *
 * @param {Object} obj - Object to search.
 * @param {*} value - Value to look for.
 * @returns {string|undefined} The matching key, if any.
 */
const getKeyByValue = (obj, value) => {
  for (const candidate of Object.keys(obj)) {
    if (obj[candidate] === value) {
      return candidate;
    }
  }
  return undefined;
};
/**
 * Lookup table from part-of-speech codes (the `penn` value produced via
 * `verbFormsMapping`) to the tag name stored under `tags.compromise`.
 */
const compromiseMapping = {
  CC: 'Conjunction',
  CD: 'Cardinal',
  DT: 'Determiner',
  EX: 'Preposition', // Existential there
  FW: 'Expression',
  IN: 'Preposition',
  JJ: 'Adjective',
  JJR: 'Comparative',
  JJS: 'Superlative',
  MD: 'Modal',
  NN: 'Noun',
  NNS: 'Plural',
  NNP: 'Singular',
  NNPS: 'Plural', // was ' Plural' (stray leading space), which never equalled the NNS tag
  POS: 'Possessive',
  PRP: 'Pronoun',
  RB: 'Adverb',
  RBR: 'Comparative',
  RBS: 'Superlative',
  RP: 'PhrasalVerb',
  PDT: 'Determiner',
  SYM: 'Expression',
  TO: 'Conjunction',
  UH: 'Expression',
  VB: 'Verb',
  VBD: 'PastTense',
  VBG: 'Gerund',
  VBN: 'Participle', // past participle
  VBP: 'PresentTense', // non-3rd person singular present
  VBZ: 'PresentTense', // 3rd person singular present
  PRP$: 'Pronoun',
  WP$: 'Possessive',
  WDT: 'Determiner',
  WP: 'Pronoun',
  WRB: 'Adverb',
};
/**
 * Lookup table from grammatical-feature labels (the values collected in the
 * query's `?Uses` column) to part-of-speech codes used as keys of
 * `compromiseMapping`.
 */
const verbFormsMapping = {
  'simple past': 'VBD',
  'past participle in english': 'VBN',
  // The query returns this label with a capital "E", so the lower-case key
  // alone never matched; both spellings are kept for compatibility.
  'past participle in English': 'VBN',
  'present participle': 'VBG',
  plural: 'NNS',
  singular: 'NNP',
  'third person': 'VBZ',
  'first person singular': 'VBP',
  'second person singular': 'VB',
  'third person plural': 'VBP',
  infinitive: 'VB',
  present: 'VBP',
  // "simple present" is also returned by the query; treated like "present".
  'simple present': 'VBP',
  past: 'VBD',
  gerund: 'VBG',
  positive: 'JJ',
  comparative: 'JJR',
  superlative: 'JJS',
};
/**
 * Converts a SPARQL JSON response (as produced by the query in `getWordInfo`)
 * into `{ words: [...] }`, logs that object and returns it.
 *
 * Each entry has the shape:
 *   word     - the queried word form
 *   pos      - lexical category labels
 *   tags     - `{ wikidata, penn, compromise }` for the feature label whose
 *              form equals `word` (fields are `undefined` when none matches)
 *   lemma    - the lemma, or `null` when it equals `word` ignoring case
 *   wikidata - last path segment of every form URI
 *   forms    - `{ featureLabel: formText }`
 *   senses   - definition glosses
 *
 * @param {{results: {bindings: Object[]}}} results - Parsed SPARQL JSON response.
 * @returns {{words: Object[]}} The mapped result.
 */
function mapToSchema(results) {
  const SEPARATOR = '|$|';

  // An empty GROUP_CONCAT arrives as ''; ''.split() would yield [''] rather
  // than an empty list, so that case is handled explicitly.
  const splitGrouped = (binding) => {
    const raw = binding && typeof binding.value === 'string' ? binding.value : '';
    return raw === '' ? [] : raw.split(SEPARATOR);
  };

  const bindings = results.results.bindings;

  const words = bindings.map((row) => {
    const word = row.word.value;
    const lemmaText = row.lemma.value;
    const uses = splitGrouped(row.Uses);
    const sameMeaning = splitGrouped(row.SameMeaning);

    // FIXME: Uses and SameMeaning are independent GROUP_CONCAT(DISTINCT ...)
    // columns in the query, so pairing them by index is not guaranteed to
    // match a feature with its own form. Pairing them reliably needs a query
    // change; the original index-based behaviour is kept here.
    const forms = {};
    uses.forEach((featureLabel, index) => {
      forms[featureLabel] = sameMeaning[index] || sameMeaning[sameMeaning.length - 1];
    });

    const matchedFeature = getKeyByValue(forms, word);
    // Labels can differ in case from the mapping keys (e.g. "... in English"),
    // so fall back to a lower-cased lookup.
    const penn = matchedFeature === undefined
      ? undefined
      : verbFormsMapping[matchedFeature] || verbFormsMapping[matchedFeature.toLowerCase()];

    return {
      word,
      pos: splitGrouped(row.grammar),
      tags: { wikidata: matchedFeature, penn, compromise: compromiseMapping[penn] },
      // The lemma is only reported when it differs from the queried word.
      lemma: word.toLowerCase() !== lemmaText.toLowerCase() ? lemmaText : null,
      wikidata: splitGrouped(row.LexIDs).map((uri) => uri.split('/').pop()),
      forms,
      senses: splitGrouped(row.Senses),
    };
  });

  const mappedResults = { words };
  console.log(mappedResults);
  return mappedResults;
}
// Example usage: look up a few word forms with the default language code.
const wordsToQuery = [
  'hated',
  'hate',
  'going',
  'go',
];
getWordInfo(wordsToQuery);
@MarketingPip
Copy link
Author

@spencermountain - I don't mean to be a bother. I was hoping to tag you once I had finished results, but I'm stumped right now (it doesn't help that I think I have a concussion right now lol).

You can see the features for words like this here.

And you can try visualizing it here.

Some words have multiple uses - but not sure how to properly map them.

Example the query returns:

Uses:
present participle|$|simple present|$|simple past|$|past participle in English|$|third person|$|singular

Words:
hating|$|hates|$|hate|$|hated

Ideally, they would be mapped in the lexicon like this -

hated:[past participle,
          simple past]

Again - apologies for asking for your help, but I figured you wouldn't mind, considering I built this mostly to help you / the compromise.js project lol.

Hopefully this helps somewhat - it took me hours to figure out the proper query and avoid crazy timeouts etc. lol (plus you won't have to worry about me making PRs with words that do not have proper tags now hahaha!)

@MarketingPip
Copy link
Author

@spencermountain - I originally tagged you in this to ask for help, but I think I got it figured out! (I revised the gist up top.) I would appreciate it if you could test it out and see if there are any errors, etc.

Let me know what you think too! 😄

@MarketingPip
Copy link
Author

@spencermountain - I don't know if you ever got to peek at this, but if we solved it I think it would be useful. Plus, feel free to use it to get all the lemmas for the words you need, etc. (it works properly).

@spencermountain
Copy link

cool way to get wordnet data - glad it's useful for you!

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment