Skip to content

Instantly share code, notes, and snippets.

@MarketingPip
Last active January 23, 2024 03:22
Show Gist options
  • Save MarketingPip/9ebe945b5e6b72f12d1f2f7e449092b5 to your computer and use it in GitHub Desktop.
Save MarketingPip/9ebe945b5e6b72f12d1f2f7e449092b5 to your computer and use it in GitHub Desktop.

Hey Spencer

@spencermountain

I didn't want to open an issue, and I don't know how else to contact you! lol (Plus, this is just a ton of random stuff, but hopefully it will be of some benefit to you / the compromise project.)

Port duckling over

Hoping you can take a look at these - maybe we can get to porting some of the rules over. (I have a basic quantity parser from a while back that I was hoping to share with you.)

https://github.com/facebook/duckling - I am sure you are familiar.

https://github.com/facebook/duckling/blob/7520daaeba28691cda8e1b5c3d946028a28fb64b/Duckling/Temperature/EN/Corpus.hs < corpus for temperatures (one I think we could do easily).


Improved Suffixes / Affixes / Name Detection

As well, wanted to drop this here. (Could be very useful for identifying names with match, #FirstName (suffixes shoved here so we don't fXck the lexicon).)

https://en.wikipedia.org/wiki/List_of_family_name_affixes

(Dutch ones are useful too) - since last names like mine are commonly spelled / written as "van Valkengoed".

https://en.wikipedia.org/wiki/Tussenvoegsel < a list of Dutch family-name affixes (such as "van") that can help identify these (#FirstName #DutchSuffix) etc..


Gender Classification

This might or might not be of use to you. But these are the most common two-letter endings found across over 40k male / female names, and oddly I found they work WEIRDLY well.

Check it out -

// Two-letter name endings per gender, as supplied by the author (described as already
// sorted, most common first), plus the single top ending for each gender.
const data = {
  "female": [
    "na", "ia", "ne", "ta", "la",
    "ra", "ah", "da", "ka", "ie",
    "ja", "sa", "ha", "te", "le"
  ],
  "male": [
    "an", "on", "ng", "us", "in",
    "io", "er", "as", "ar", "el",
    "is", "ro", "no", "en", "ur"
  ],
  "top_chars": {
    "female": "na",
    "male": "an"
  }
};

/**
 * Builds the lookup table of name endings from a raw per-gender table.
 *
 * Every array-valued key of `data` is treated as a gender whose value is a list of
 * endings already sorted from most to least common. The list is copied into the
 * result, and its first element is recorded under `top_chars[gender]`.
 *
 * Non-array entries (for example a `top_chars` summary already present on the input)
 * are ignored, so the input is never mutated and the summary is always rebuilt from
 * the lists themselves.
 *
 * @param {Object} data - Raw table, e.g. `{ female: [...], male: [...] }`.
 * @returns {{female: string[], male: string[], top_chars: Object}} Fresh table with
 *   copied ending lists and the top ending per gender.
 */
function transformData(data) {
  const transformedData = {
    "female": [],
    "male": [],
    "top_chars": {}
  };

  // `data || {}` keeps the old behaviour of returning an empty table for null/undefined.
  for (const gender of Object.keys(data || {})) {
    const endings = data[gender];

    // Only ending lists describe a gender. Processing the input's own "top_chars" object
    // here would replace the summary and write a bogus key into the caller's object.
    if (gender === "top_chars" || !Array.isArray(endings)) {
      continue;
    }

    // Copy so later changes to the result cannot leak back into the input.
    transformedData[gender] = endings.slice();

    // The lists are pre-sorted, so the first item is the most common ending.
    if (endings.length > 0) {
      transformedData["top_chars"][gender] = endings[0];
    }
  }

  return transformedData;
}


// Build the ending table once; findGender() reads this module-level value on each call.
const transformedResult = transformData(data);
// Print the table so the endings being used can be inspected.
console.log(transformedResult);


/**
 * Guesses a gender from the last two letters of a first name.
 *
 * The ending is looked up in the module-level `transformedResult` table: the female
 * list is checked first, then the male list. The comparison ignores letter case and
 * surrounding whitespace, because the table holds lower-case endings only.
 *
 * @param {string} name - First name to classify.
 * @returns {{name: *, gender: string}} The name exactly as passed in, plus
 *   "female", "male" or "unknown" (also "unknown" for non-string input).
 */
function findGender(name) {
  const { female, male } = transformedResult;

  // Anything that is not a string has no ending to inspect.
  if (typeof name !== "string") {
    return { name, gender: "unknown" };
  }

  // Normalise before slicing so "BRIAN" and "Maria  " match the lower-case table.
  const lastTwoLetters = name.trim().toLowerCase().slice(-2);

  if (female.includes(lastTwoLetters)) {
    return { name, gender: "female" };
  }
  if (male.includes(lastTwoLetters)) {
    return { name, gender: "male" };
  }
  return { name, gender: "unknown" };
}

/**
 * Runs findGender() over a fixed list of first names.
 *
 * Logs one "PASS"/"FAIL" line per name, then a summary: the number of passing names
 * per returned gender and the overall success rate as a percentage of all names.
 */
function testFindGender() {
  // [name, expected gender] pairs, checked in this order.
  const cases = [
    ["Lana", "female"],
    ["Brian", "male"],
    ["Alex", "male"],
    ["Maria", "female"],
    ["Daniel", "male"],
    ["Sophia", "female"],
    ["John", "male"],
    ["Eva", "female"],
    ["Samuel", "male"],
    ["Olivia", "female"],
    ["William", "male"],
    ["Isabella", "female"],
    ["Ethan", "male"],
    ["Ava", "female"],
    ["Matthew", "male"],
    ["Emily", "female"],
    ["Christopher", "male"],
    ["Emma", "female"],
    ["Michael", "male"],
    ["Jared", "male"],
    ["Sierra", "female"],
    ["John", "male"],
    ["Mary", "female"],
    ["Alex", "male"],
    ["Emily", "female"],
    ["Chris", "male"],
    ["Taylor", "female"],
    ["Jordan", "male"],
    ["Amanda", "female"],
    ["Ryan", "male"],
    ["Emma", "female"],
    ["Amelia", "female"],
    ["Sean", "male"],
    ["Noah", "male"],
    ["Olivia", "female"],
    ["Grace", "female"],
    ["Lucas", "male"],
    ["Chloe", "female"],
    ["Aaron", "male"],
    ["Mia", "female"],
    ["Jacob", "male"],
    ["Sophie", "female"],
    ["Nicholas", "male"],
    ["Zoe", "female"],
    ["Justin", "male"],
    ["Ella", "female"],
    ["Caleb", "male"],
    ["Avery", "female"],
    ["Dylan", "male"],
    ["Victoria", "female"],
    ["Gabriel", "male"],
    ["Hannah", "female"],
    ["Jordan", "male"],
    ["Natalie", "female"],
    ["Cameron", "male"],
    ["Leah", "female"],
    ["Logan", "male"],
    ["Madison", "female"],
    ["Owen", "male"],
    ["Aria", "female"],
    ["Connor", "male"],
    ["Sofia", "female"],
    ["Mason", "male"],
    ["Lily", "female"],
    ["Evan", "male"],
    ["Scarlett", "female"],
    ["Nathan", "male"],
    ["Addison", "female"],
    ["Isaac", "male"],
    ["Brooklyn", "female"],
    ["Jackson", "male"],
    ["Claire", "female"],
    ["Tyler", "male"],
    ["Peyton", "female"],
    ["Henry", "male"],
    ["Grace", "female"],
    ["Elijah", "male"],
    ["Mackenzie", "female"],
    ["Colton", "male"],
    ["Katherine", "female"],
    ["Julian", "male"],
    ["Aubrey", "female"],
    ["Brayden", "male"],
    ["Lillian", "female"],
    ["Wyatt", "male"],
    ["Riley", "female"],
    ["Gabriel", "male"],
    ["Ellie", "female"],
    ["Dominic", "male"],
    ["Harper", "female"],
    ["Isaiah", "male"],
    ["Annabelle", "female"],
    ["Levi", "male"],
    ["Aaliyah", "female"],
    ["Jaxon", "male"],
    ["Zara", "female"],
    ["Josiah", "male"],
    ["Mila", "female"],
    ["Xavier", "male"],
    ["Nora", "female"],
    ["Sebastian", "male"],
    ["Savannah", "female"],
    ["Jonathan", "male"],
    ["Alyssa", "female"],
    ["Benjamin", "male"],
    ["Penelope", "female"],
    ["Carter", "male"],
    ["Aria", "female"],
    ["Liam", "male"],
    ["Addison", "female"],
    // Add more test cases as needed
  ];

  // Passing cases, bucketed by the gender that findGender() returned.
  const passed = { male: 0, female: 0, unknown: 0 };

  for (const [name, expected] of cases) {
    const { gender } = findGender(name);
    const isPass = gender === expected;
    const verdict = isPass ? "PASS" : "FAIL";
    console.log(`${name}: Expected ${expected}, Got ${gender} - ${verdict}`);

    // Failures are logged but not tallied.
    if (!isPass) {
      continue;
    }

    if (gender === "male") {
      passed.male += 1;
    } else if (gender === "female") {
      passed.female += 1;
    } else {
      passed.unknown += 1;
    }
  }

  const total = cases.length;
  const passedTotal = passed.male + passed.female + passed.unknown;
  const rate = (passedTotal / total) * 100;

  console.log("\nFinal Results:");
  console.log(`Passed - Male: ${passed.male}`);
  console.log(`Passed - Female: ${passed.female}`);
  console.log(`Passed - Unknown: ${passed.unknown}`);
  console.log(`Success Rate: ${rate.toFixed(2)}% - based on ${total} names`);
}

// Run the name checks immediately; results are written to the console.
testFindGender();

I was thinking maybe we could add this into Compromise.js as a fallback to classify names. I am also curious about / still have to play with results for 3 characters. (maybe a double rule check / probability etc...)

Diminutive Names

I have a list of names such as "Jake":["Jacob", "Jack", "Jac"], unfortunately not sorted by most common name. But it would be cool if we could have a function to normalize a name, such as "Spence" to "Spencer".

Last Names

I have scraped a huge list of various family names (oddly even my last name was in there) - a small list / but again a very non-biased / very cultural list. (sorted by language)

You can view example of data that was scraped here - https://en.wikipedia.org/wiki/Category:Dutch-language_surnames

Let me know if this is of use to you - if so I will add you to the repo.

Happy New Year

Hope all is well. I know I have some other random thoughts / resources etc. I wanted to share with you - I've been feeling crazy sick + burnt out the past few weeks, but I will try to commit some useful things to wtf_wikipedia + compromise asap.

Cheers!

See comments below!

Updated as of 22nd Jan!

@MarketingPip
Copy link
Author

MarketingPip commented Jan 23, 2024

Another thing I'm leaving here - just playing around tonight. Though it might help with the name parser if we wrote this in the Compromise rule set. (Maybe not useful at all, but...)

Court Parser

// Sample case citations passed to courtCaseParser() in the demo calls further down.
const input1 = "U.S. v. Thompson";
const input2 = "R v Spencer";

/**
 * Extracts the defendant name(s) from the first criminal-case citation in a string.
 *
 * A citation is a prosecuting party ("R", "U.S."/"US" or "Queen", each optionally
 * followed by a dot), then " v", " v." or " vs", a space, and one or more capitalised
 * name words. Two defendants joined by " and " are returned together.
 *
 * Name words may contain inner capitals, apostrophes and hyphens, so "McDonald",
 * "O'Brien" and "Smith-Jones" are returned whole instead of truncated or missed.
 *
 * @param {string} inputStr - Text that may contain a case citation.
 * @returns {string|null} The defendant name(s), or null when no citation is found
 *   or the input is not a string.
 */
function courtCaseParser(inputStr) {
  if (typeof inputStr !== "string") {
    return null;
  }

  // One name word: a capital, then letters; ' ’ or - only when another letter follows.
  const nameWord = "[A-Z](?:[A-Za-z]|['’-](?=[A-Za-z]))+";
  const fullName = `${nameWord}(?:\\s+${nameWord})*`;

  // \b stops the party from matching inside a longer word (e.g. the "R" of "OUR").
  // The "X and Y" alternative comes first so both defendants are captured when present.
  const caseRegex = new RegExp(
    `\\b(?:R\\.?|U\\.?S\\.?|Queen\\.?) v(?:\\.|s)? (${fullName} and ${fullName}|${fullName})`
  );

  const match = inputStr.match(caseRegex);
  return match ? match[1] : null;
}
// Demo calls: each trailing comment shows the defendant name(s) the parser is expected to print.
console.log(courtCaseParser(input1)) // Thompson
console.log(courtCaseParser(input2)) // Spencer
console.log(courtCaseParser("hey man did you see the R. v. Van Earkle court case?")) // Van Earkle
console.log(courtCaseParser("No but I watched the US. v Van Payne case it was wild")) // Van Payne
console.log(courtCaseParser("R v Dudley and Stephens")) // Dudley and Stephens

console.log(courtCaseParser("What are the ethical issues in the Queen v Dudley and Stephens case?")) // Dudley and Stephens

console.log(courtCaseParser("Northern District of California | U.S. v. Elizabeth Holmes")) // Elizabeth Holmes

Updated version with test case:

// Sample case citations; not referenced by the code shown in this snippet,
// where runTests() supplies its own inputs.
const input1 = "U.S. v. Thompson";
const input2 = "R v Spencer";

/**
 * Extracts the defendant name(s) from the first criminal-case citation in a string.
 *
 * A citation is a prosecuting party ("R"/"R.", "U.S."/"US", "Queen", "State" or
 * "People"), a "versus" marker ("v", "v.", "vs" or "vs."), and one or more capitalised
 * name words. Two defendants joined by " and " are returned together.
 *
 * Name words may contain inner capitals, apostrophes and hyphens, so "McDonald",
 * "O'Brien" and "Smith-Jones" are returned whole instead of truncated or missed.
 *
 * @param {string} inputStr - Text that may contain a case citation.
 * @returns {string|null} The defendant name(s), or null when no citation is found
 *   or the input is not a string.
 */
function courtCaseParser(inputStr) {
  if (typeof inputStr !== "string") {
    return null;
  }

  // One name word: a capital, then letters; ' ’ or - only when another letter follows.
  const nameWord = "[A-Z](?:[A-Za-z]|['’-](?=[A-Za-z]))+";
  const fullName = `${nameWord}(?:\\s+${nameWord})*`;

  // \b stops the party from matching inside a longer word (e.g. the "R" of "OUR").
  // "v\.?s?\.?" accepts v, v., vs and vs. (the trailing dot was previously rejected).
  // The "X and Y" alternative comes first so both defendants are captured when present.
  const caseRegex = new RegExp(
    `\\b(?:R\\.?|U\\.?S\\.?|Queen|State|People)\\s+v\\.?s?\\.?\\s+(${fullName} and ${fullName}|${fullName})`
  );

  const match = inputStr.match(caseRegex);
  return match ? match[1] : null;
}
/**
 * Checks courtCaseParser() against a fixed list of citations.
 *
 * For each case, in order, logs a "passed" line via console.log when the parser
 * returns exactly the expected name(s), otherwise a "failed" line via console.error.
 * Cases are numbered from 1.
 */
function runTests() {
  // [citation, expected defendant name(s)] pairs.
  const cases = [
    ["R v Smith and Johnson", "Smith and Johnson"],
    ["U.S. v Doe and Roe", "Doe and Roe"],
    ["Queen v Johnson and Smith", "Johnson and Smith"],
    ["R. v. Brown and White", "Brown and White"],
    ["US v. Anderson and Davis", "Anderson and Davis"],
    ["R v Taylor and Turner", "Taylor and Turner"],
    ["U.S. v. Robinson and Clark", "Robinson and Clark"],
    ["Queen vs Harris and Lewis", "Harris and Lewis"],

    ["U.S. v. Jackson", "Jackson"],
    ["R v. Miller", "Miller"],
    ["People v. Sanchez", "Sanchez"],
    ["State v. Thompson", "Thompson"],
    ["U.S. v. Davis", "Davis"],
    ["R v. Wilson", "Wilson"],
    ["People v. Carter", "Carter"],
    ["State v. Brown", "Brown"],
    ["U.S. v. Turner", "Turner"],
    ["R v. Harris", "Harris"],
    ["People v. Rodriguez", "Rodriguez"],
    ["State v. Miller", "Miller"],
    ["U.S. v. Walker", "Walker"],
    ["R v. Jenkins", "Jenkins"],
    ["People v. Foster", "Foster"],
    ["State v. King", "King"],
    ["U.S. v. White", "White"],
    ["R v. Martinez", "Martinez"],
    ["People v. Adams", "Adams"],
    ["State v. Scott", "Scott"],
    ["U.S. v. Garcia", "Garcia"],
    ["R v. Young", "Young"],
    ["People v. Perez", "Perez"],
    ["State v. Taylor", "Taylor"],
    ["U.S. v. Mitchell", "Mitchell"],
    ["R v. Robinson", "Robinson"],
    ["People v. Wright", "Wright"],
    ["State v. Turner", "Turner"],
    ["U.S. v. Moore", "Moore"],
    ["R v. Lewis", "Lewis"],
    ["People v. Nelson", "Nelson"],
    ["State v. Adams", "Adams"],
    // Add more test cases as needed
  ];

  let caseNumber = 0;
  for (const [citation, wanted] of cases) {
    caseNumber += 1;
    const actual = courtCaseParser(citation);

    // Report mismatches on stderr and move on to the next case.
    if (actual !== wanted) {
      console.error(`Test case ${caseNumber} failed. Expected "${wanted}", got "${actual}"`);
      continue;
    }

    console.log(`Test case ${caseNumber} passed, result: ${actual}`);
  }
}

// Run the parser checks immediately; passes go to console.log, failures to console.error.
runTests();

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment