@spencermountain
Do not want to open an issue nor do I know how to contact you! lol (plus this is just a ton of random stuff but hopefully it will be some benefit to you / the compromise project).
Hoping you can take a look at these - maybe we can get to porting some of the rules over. (I have a basic quantity parser from a while back that I was hoping to share with you.)
https://github.com/facebook/duckling - I am sure you are familiar with it.
https://github.com/facebook/duckling/blob/7520daaeba28691cda8e1b5c3d946028a28fb64b/Duckling/Temperature/EN/Corpus.hs < corpus for temperatures (one I think we could do easily).
As well, wanted to drop this here. (Could be very useful for identifying names with match, #FirstName (suffixes shoved here so we don't fXck the lexicon).)
https://en.wikipedia.org/wiki/List_of_family_name_affixes
(Dutch ones are useful too) - since names like my last name are commonly spelled / written as "van Valkengoed".
https://en.wikipedia.org/wiki/Tussenvoegsel < a list of suffixes that can help identify these (#FirstName #DutchSuffix) etc..
This might or might not be of use to you, but these are the top two-letter endings found across over 40k male / female names. Oddly, I found that it works WEIRDLY well.
Check it out -
// Two-letter strings per gender, plus the first entry of each list under
// "top_chars". findGender compares these against the last two letters of a name.
const data = {
  female: ["na", "ia", "ne", "ta", "la", "ra", "ah", "da", "ka", "ie", "ja", "sa", "ha", "te", "le"],
  male: ["an", "on", "ng", "us", "in", "io", "er", "as", "ar", "el", "is", "ro", "no", "en", "ur"],
  top_chars: { female: "na", male: "an" },
};
/**
 * Builds a normalized gender -> endings table and records the first ending of
 * each list under "top_chars".
 *
 * Only array-valued own properties of `data` are treated as gender lists.
 * Anything else (for example an existing "top_chars" summary object) is
 * skipped, so the input is never mutated and no stray keys leak into the
 * result. The returned arrays are copies of the input arrays.
 *
 * @param {Object} data - Map of gender name to an already-sorted array of endings.
 * @returns {Object} Object with a copied array per gender (always including
 *   "female" and "male") and a "top_chars" map of gender -> first ending.
 */
function transformData(data) {
  const transformedData = {
    female: [],
    male: [],
    top_chars: {},
  };

  // Object.keys yields own enumerable keys only and also works for
  // prototype-less objects; a null/undefined input yields the empty skeleton.
  for (const gender of Object.keys(data || {})) {
    const endings = data[gender];

    // "top_chars" is the derived summary, not a gender list; re-processing it
    // would overwrite the summary object and write into the caller's data.
    if (gender === "top_chars" || !Array.isArray(endings)) {
      continue;
    }

    transformedData[gender] = [...endings];

    // The list is already sorted, so its first item is the top ending.
    // Empty lists have no top ending and get no "top_chars" entry.
    if (endings.length > 0) {
      transformedData.top_chars[gender] = endings[0];
    }
  }

  return transformedData;
}
// Build the lookup table that findGender reads, and print it for inspection.
const transformedResult = transformData(data);
console.log(transformedResult);
/**
 * Guesses a gender from the last two letters of a name, using the
 * file-level `transformedResult` lookup table.
 *
 * The ending is compared case-insensitively and surrounding whitespace is
 * ignored, because the table built in this file holds lower-case endings.
 * The `name` in the result is always the value that was passed in, unchanged.
 *
 * @param {string} name - Name to classify.
 * @returns {{name: *, gender: string}} `gender` is "female", "male" or
 *   "unknown" (no matching ending, or `name` is not a string).
 */
function findGender(name) {
  const data = transformedResult;

  // Non-string input cannot be classified; report it rather than throwing.
  if (typeof name !== "string") {
    return { name, gender: "unknown" };
  }

  // Normalize so "BRIAN" and "Maria " match the lower-case table entries.
  const lastTwoLetters = name.trim().slice(-2).toLowerCase();

  if (data.female.includes(lastTwoLetters)) {
    return { name, gender: "female" };
  }
  if (data.male.includes(lastTwoLetters)) {
    return { name, gender: "male" };
  }
  return { name, gender: "unknown" };
}
/**
 * Runs findGender over a fixed list of names with expected genders.
 *
 * Logs one "PASS"/"FAIL" line per name, then a summary with the number of
 * passing predictions per returned gender and the overall success rate.
 */
function testFindGender() {
  const cases = [
    { name: "Lana", expected: "female" },
    { name: "Brian", expected: "male" },
    { name: "Alex", expected: "male" },
    { name: "Maria", expected: "female" },
    { name: "Daniel", expected: "male" },
    { name: "Sophia", expected: "female" },
    { name: "John", expected: "male" },
    { name: "Eva", expected: "female" },
    { name: "Samuel", expected: "male" },
    { name: "Olivia", expected: "female" },
    { name: "William", expected: "male" },
    { name: "Isabella", expected: "female" },
    { name: "Ethan", expected: "male" },
    { name: "Ava", expected: "female" },
    { name: "Matthew", expected: "male" },
    { name: "Emily", expected: "female" },
    { name: "Christopher", expected: "male" },
    { name: "Emma", expected: "female" },
    { name: "Michael", expected: "male" },
    { name: "Jared", expected: "male" },
    { name: "Sierra", expected: "female" },
    { name: "John", expected: "male" },
    { name: "Mary", expected: "female" },
    { name: "Alex", expected: "male" },
    { name: "Emily", expected: "female" },
    { name: "Chris", expected: "male" },
    { name: "Taylor", expected: "female" },
    { name: "Jordan", expected: "male" },
    { name: "Amanda", expected: "female" },
    { name: "Ryan", expected: "male" },
    { name: "Emma", expected: "female" },
    { name: "Amelia", expected: "female" },
    { name: "Sean", expected: "male" },
    { name: "Noah", expected: "male" },
    { name: "Olivia", expected: "female" },
    { name: "Grace", expected: "female" },
    { name: "Lucas", expected: "male" },
    { name: "Chloe", expected: "female" },
    { name: "Aaron", expected: "male" },
    { name: "Mia", expected: "female" },
    { name: "Jacob", expected: "male" },
    { name: "Sophie", expected: "female" },
    { name: "Nicholas", expected: "male" },
    { name: "Zoe", expected: "female" },
    { name: "Justin", expected: "male" },
    { name: "Ella", expected: "female" },
    { name: "Caleb", expected: "male" },
    { name: "Avery", expected: "female" },
    { name: "Dylan", expected: "male" },
    { name: "Victoria", expected: "female" },
    { name: "Gabriel", expected: "male" },
    { name: "Hannah", expected: "female" },
    { name: "Jordan", expected: "male" },
    { name: "Natalie", expected: "female" },
    { name: "Cameron", expected: "male" },
    { name: "Leah", expected: "female" },
    { name: "Logan", expected: "male" },
    { name: "Madison", expected: "female" },
    { name: "Owen", expected: "male" },
    { name: "Aria", expected: "female" },
    { name: "Connor", expected: "male" },
    { name: "Sofia", expected: "female" },
    { name: "Mason", expected: "male" },
    { name: "Lily", expected: "female" },
    { name: "Evan", expected: "male" },
    { name: "Scarlett", expected: "female" },
    { name: "Nathan", expected: "male" },
    { name: "Addison", expected: "female" },
    { name: "Isaac", expected: "male" },
    { name: "Brooklyn", expected: "female" },
    { name: "Jackson", expected: "male" },
    { name: "Claire", expected: "female" },
    { name: "Tyler", expected: "male" },
    { name: "Peyton", expected: "female" },
    { name: "Henry", expected: "male" },
    { name: "Grace", expected: "female" },
    { name: "Elijah", expected: "male" },
    { name: "Mackenzie", expected: "female" },
    { name: "Colton", expected: "male" },
    { name: "Katherine", expected: "female" },
    { name: "Julian", expected: "male" },
    { name: "Aubrey", expected: "female" },
    { name: "Brayden", expected: "male" },
    { name: "Lillian", expected: "female" },
    { name: "Wyatt", expected: "male" },
    { name: "Riley", expected: "female" },
    { name: "Gabriel", expected: "male" },
    { name: "Ellie", expected: "female" },
    { name: "Dominic", expected: "male" },
    { name: "Harper", expected: "female" },
    { name: "Isaiah", expected: "male" },
    { name: "Annabelle", expected: "female" },
    { name: "Levi", expected: "male" },
    { name: "Aaliyah", expected: "female" },
    { name: "Jaxon", expected: "male" },
    { name: "Zara", expected: "female" },
    { name: "Josiah", expected: "male" },
    { name: "Mila", expected: "female" },
    { name: "Xavier", expected: "male" },
    { name: "Nora", expected: "female" },
    { name: "Sebastian", expected: "male" },
    { name: "Savannah", expected: "female" },
    { name: "Jonathan", expected: "male" },
    { name: "Alyssa", expected: "female" },
    { name: "Benjamin", expected: "male" },
    { name: "Penelope", expected: "female" },
    { name: "Carter", expected: "male" },
    { name: "Aria", expected: "female" },
    { name: "Liam", expected: "male" },
    { name: "Addison", expected: "female" },
    // Add more test cases as needed
  ];

  // Passing predictions, bucketed by the gender that findGender returned.
  const passed = { male: 0, female: 0, unknown: 0 };

  for (const { name, expected } of cases) {
    const { gender } = findGender(name);
    const isPass = gender === expected;
    console.log(`${name}: Expected ${expected}, Got ${gender} - ${isPass ? "PASS" : "FAIL"}`);

    if (!isPass) {
      continue;
    }

    // Any passing result that is neither "male" nor "female" counts as unknown.
    const bucket = gender === "male" || gender === "female" ? gender : "unknown";
    passed[bucket] += 1;
  }

  const totalTests = cases.length;
  const passedTotal = passed.male + passed.female + passed.unknown;
  const successRate = (passedTotal / totalTests) * 100;

  console.log("\nFinal Results:");
  console.log(`Passed - Male: ${passed.male}`);
  console.log(`Passed - Female: ${passed.female}`);
  console.log(`Passed - Unknown: ${passed.unknown}`);
  console.log(`Success Rate: ${successRate.toFixed(2)}% - based on ${totalTests} names`);
}
// Run the test: logs one PASS/FAIL line per name, followed by the summary.
testFindGender();
I was thinking maybe we could add this into Compromise.js as a fallback to classify names. I am also curious about / still have to play with the results for 3 characters. (Maybe a double rule check / probability etc...)
I have a list of names such as "Jake":["Jacob", "Jack", "Jac"], unfortunately not sorted by most common name. But it would be cool if we could have a function to normalize a name, such as "Spence" to "Spencer".
I have scraped a huge list of various family names (oddly even my last name was in there) - a small list per language, but again a very unbiased / very cultural list. (sorted by language)
You can view example of data that was scraped here - https://en.wikipedia.org/wiki/Category:Dutch-language_surnames
Let me know if this is of use to you - if so I will add you to the repo.
Hope all is well, I know I have some other random thoughts / resources etc. I wanted to share with you - been feeling crazy sick + burn out past few weeks but I will try to commit some useful things to wtf_wikipedia + compromise asap.
Cheers!
Updated as of 22nd Jan!
Another thing I am leaving here - just playing around tonight. Though it might help with the name parser if we wrote this as a Compromise rule set. (Maybe not useful at all, but...)
Court Parser
Updated version with test case: