Skip to content

Instantly share code, notes, and snippets.

@sfboss
Created April 21, 2023 23:42
Show Gist options
  • Save sfboss/5fbcd511a20d38264b8766ebe8f92e6d to your computer and use it in GitHub Desktop.
Save sfboss/5fbcd511a20d38264b8766ebe8f92e6d to your computer and use it in GitHub Desktop.
This is a Python script for getting named entities from the field data of a Salesforce object.
import json
import shutil
import subprocess
from collections import Counter

import nltk
import pandas as pd
def get_ners(txt, *, verbose=True):
    """Tag *txt* with NLTK and return a frequency table of its nouns.

    The text is tokenized, part-of-speech tagged and passed through NLTK's
    named-entity chunker.  Adjacent entity chunks are merged into a single
    phrase; the unique phrases, in order of first appearance, are stored on
    the returned frame as ``attrs["named_entities"]``.

    NOTE(review): the NLTK tokenizer, tagger and chunker presumably need
    their data packages downloaded beforehand (``nltk.download``) -- confirm
    for the target environment.

    Args:
        txt: The text to analyse.
        verbose: When true (the default), print every token together with
            its part-of-speech tag.

    Returns:
        A ``pandas.DataFrame`` indexed by noun (tags NN, NNS, NNP, NNPS)
        with a single ``Count`` column, sorted by descending count.  Nouns
        with equal counts keep their order of first appearance.
    """
    from nltk import Tree
    from nltk.chunk import ne_chunk
    from nltk.tag import pos_tag
    from nltk.tokenize import word_tokenize

    pos_tagged = pos_tag(word_tokenize(txt))

    named_entities = []
    pending = []
    # The trailing ``None`` acts as a non-entity node so that an entity
    # sitting at the very end of the text is flushed as well.
    for node in [*ne_chunk(pos_tagged), None]:
        if isinstance(node, Tree):
            pending.append(" ".join(token for token, _ in node.leaves()))
            continue
        if pending:
            phrase = " ".join(pending)
            if phrase not in named_entities:
                named_entities.append(phrase)
            pending = []

    if verbose:
        for word, tag in pos_tagged:
            print(word, tag)

    noun_tags = {"NN", "NNS", "NNP", "NNPS"}
    # Counter tallies in a single pass and remembers first-seen order.
    noun_counts = Counter(word for word, tag in pos_tagged if tag in noun_tags)

    nouns_df = pd.DataFrame.from_dict(
        dict(noun_counts), orient="index", columns=["Count"])
    # A stable sort keeps the row order of equal counts reproducible.
    nouns_df.sort_values(
        by="Count", inplace=True, ascending=False, kind="mergesort")
    # NOTE: DataFrame.attrs is flagged as experimental in the pandas docs.
    nouns_df.attrs["named_entities"] = named_entities
    return nouns_df
# Replace this with your Salesforce Org alias
org_alias = 'your-org-alias'
# Query the Contact object for Id, Name, and Favorite_Food__c
query = "SELECT Id, Name, Favorite_Food__c FROM Contact"
# Build the command as an argument list and run it without a shell, so that
# quotes, `$` or backticks in the SOQL text reach sfdx untouched.
# shutil.which resolves the launcher's full path (e.g. a .cmd shim on
# Windows); if it is not found, the bare name is used and subprocess raises.
cmd = [
    shutil.which('sfdx') or 'sfdx',
    'force:data:soql:query',
    '--query', query,
    '--targetusername', org_alias,
    '--json',
]
# Run the sfdx command and capture the output
# (check_output raises CalledProcessError on a non-zero exit status)
sfdx_output = subprocess.check_output(cmd, text=True)
# Parse the JSON output and create a DataFrame
sfdx_json = json.loads(sfdx_output)
records = sfdx_json['result']['records']
df = pd.DataFrame(records)
# Create a string with the values from the Favorite_Food__c column.
# An empty result set yields a frame without columns, so guard the lookup.
if 'Favorite_Food__c' in df.columns:
    favorite_foods = df['Favorite_Food__c'].str.cat(sep=' ')
else:
    favorite_foods = ''
theners = get_ners(favorite_foods)
print(theners)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment