Created
April 21, 2023 23:42
-
-
Save sfboss/5fbcd511a20d38264b8766ebe8f92e6d to your computer and use it in GitHub Desktop.
This is a Python script for getting named entities from a Salesforce object's field data.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import json
import shutil
import subprocess
from collections import Counter

import nltk
import pandas as pd
def get_ners(txt, verbose=True):
    """Return a frequency table of the nouns found in *txt*.

    The text is tokenized and part-of-speech tagged with NLTK, and every
    token tagged ``NN``, ``NNS``, ``NNP`` or ``NNPS`` is counted.

    NOTE: despite its name, this function returns noun counts, not named
    entities. Named-entity chunking is deliberately not performed here
    because its result was never part of the return value.

    Args:
        txt: The text to analyse.
        verbose: When true (the default), print every token together with
            its POS tag to stdout before counting.

    Returns:
        A ``pandas.DataFrame`` indexed by noun with a single ``Count``
        column, sorted by descending count. Nouns with equal counts keep
        the order in which they first appear in *txt*. The frame is empty
        (but still has the ``Count`` column) when no nouns are found.
    """
    from nltk.tag import pos_tag
    from nltk.tokenize import word_tokenize

    pos_tagged = pos_tag(word_tokenize(txt))

    if verbose:
        for word, tag in pos_tagged:
            print(word, tag)

    # Singular, plural, proper and proper-plural noun tags.
    noun_tags = {'NN', 'NNS', 'NNP', 'NNPS'}

    # Counter tallies in one pass and remembers first-occurrence order.
    noun_counts = Counter(
        word for word, tag in pos_tagged if tag in noun_tags)

    nouns_df = pd.DataFrame.from_dict(
        dict(noun_counts), orient='index', columns=['Count'])

    # A stable sort keeps first-occurrence order among equal counts, so the
    # output is reproducible from run to run.
    nouns_df.sort_values(
        by=['Count'], inplace=True, ascending=False, kind='mergesort')
    return nouns_df
# Replace this with your Salesforce org alias.
org_alias = 'your-org-alias'

# Query the Contact object for Id, Name, and Favorite_Food__c.
query = "SELECT Id, Name, Favorite_Food__c FROM Contact"

# Build the command as an argument list and run it without a shell, so the
# query and alias are passed verbatim and cannot be reinterpreted by shell
# quoting rules. shutil.which resolves the executable through PATH (and
# PATHEXT on Windows, where the CLI is installed as a .cmd shim).
cmd = [
    shutil.which('sfdx') or 'sfdx',
    'force:data:soql:query',
    '--query', query,
    '--targetusername', org_alias,
    '--json',
]

# Run the sfdx command and capture its stdout as text. A non-zero exit
# status raises subprocess.CalledProcessError.
sfdx_output = subprocess.check_output(cmd, text=True)

# Parse the JSON output and create a DataFrame from the returned records.
sfdx_json = json.loads(sfdx_output)
records = sfdx_json['result']['records']
df = pd.DataFrame(records)

# Join the Favorite_Food__c values into one space-separated string (missing
# values are skipped by str.cat). When the query returned no records the
# column does not exist, so fall back to an empty string.
if 'Favorite_Food__c' in df.columns:
    favorite_foods = df['Favorite_Food__c'].str.cat(sep=' ')
else:
    favorite_foods = ''

theners = get_ners(favorite_foods)
print(theners)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment