# sfboss/picklist_ner.py

## picklist_ner.py
import subprocess
import pandas as pd
import json
import nltk

def get_ners(txt):
    """Count the nouns in ``txt`` and return them as a frequency table.

    Despite its name, this function does not return named entities. The text
    is tokenized and part-of-speech tagged with NLTK, and every token tagged
    ``NN``, ``NNS``, ``NNP`` or ``NNPS`` is counted.

    Every ``(word, tag)`` pair produced by the tagger is also printed to
    stdout, one pair per line.

    Args:
        txt: The text to analyse.

    Returns:
        A ``pandas.DataFrame`` indexed by noun with a single ``Count`` column,
        ordered from most to least frequent. Nouns with equal counts keep the
        order in which they first appear in ``txt``. The frame is empty when
        ``txt`` contains no nouns.
    """
    from collections import Counter

    from nltk.tag import pos_tag
    from nltk.tokenize import word_tokenize

    # NOTE(review): presumably requires the NLTK tokenizer and tagger data to
    # be downloaded beforehand -- confirm for the target environment.
    pos_tagged = pos_tag(word_tokenize(txt))

    # Diagnostic dump of every token with its tag.
    for word, tag in pos_tagged:
        print(word, tag)

    # No named-entity chunking pass here: nothing in the returned table
    # depends on it.
    noun_tags = {'NN', 'NNS', 'NNP', 'NNPS'}
    counts = Counter(word for word, tag in pos_tagged if tag in noun_tags)

    # most_common() already yields descending counts with ties in first-seen
    # order, so the table needs no further sorting and is deterministic.
    return pd.DataFrame.from_dict(
        dict(counts.most_common()), orient='index', columns=['Count'])

# Replace this with your Salesforce Org alias
org_alias = 'your-org-alias'
# Query the Contact object for Id, Name, and Favorite_Food__c
query = "SELECT Id, Name, Favorite_Food__c FROM Contact"
# NOTE: this string is executed through the shell below and the values are
# interpolated without shell quoting, so keep org_alias and query trusted.
cmd = f'sfdx force:data:soql:query --query "{query}" --targetusername {org_alias} --json'
# Run the sfdx command and capture its stdout as text; a non-zero exit status
# raises subprocess.CalledProcessError.
sfdx_output = subprocess.check_output(cmd, shell=True, text=True)
# Parse the JSON output and create a DataFrame with one row per record
sfdx_json = json.loads(sfdx_output)
records = sfdx_json['result']['records']
df = pd.DataFrame(records)
# A query that matches no records yields a DataFrame without any columns, so
# selecting the column directly would raise KeyError; analyse empty text then.
if 'Favorite_Food__c' in df.columns:
    # Join all values into one space-separated string; str.cat omits
    # missing values when no na_rep is given.
    favorite_foods = df['Favorite_Food__c'].str.cat(sep=' ')
else:
    favorite_foods = ''
theners = get_ners(favorite_foods)
print(theners)
	import subprocess
	import pandas as pd
	import json
	import nltk

	def get_ners(txt):
	    """Count the nouns in ``txt`` and return them as a frequency table.

	    Despite its name, this function does not return named entities. The
	    text is tokenized and part-of-speech tagged with NLTK, and every token
	    tagged ``NN``, ``NNS``, ``NNP`` or ``NNPS`` is counted.

	    Every ``(word, tag)`` pair produced by the tagger is also printed to
	    stdout, one pair per line.

	    Args:
	        txt: The text to analyse.

	    Returns:
	        A ``pandas.DataFrame`` indexed by noun with a single ``Count``
	        column, ordered from most to least frequent. Nouns with equal
	        counts keep the order in which they first appear in ``txt``. The
	        frame is empty when ``txt`` contains no nouns.
	    """
	    from collections import Counter

	    from nltk.tag import pos_tag
	    from nltk.tokenize import word_tokenize

	    # NOTE(review): presumably requires the NLTK tokenizer and tagger data
	    # to be downloaded beforehand -- confirm for the target environment.
	    pos_tagged = pos_tag(word_tokenize(txt))

	    # Diagnostic dump of every token with its tag.
	    for word, tag in pos_tagged:
	        print(word, tag)

	    # No named-entity chunking pass here: nothing in the returned table
	    # depends on it.
	    noun_tags = {'NN', 'NNS', 'NNP', 'NNPS'}
	    counts = Counter(word for word, tag in pos_tagged if tag in noun_tags)

	    # most_common() already yields descending counts with ties in
	    # first-seen order, so the table needs no further sorting.
	    return pd.DataFrame.from_dict(
	        dict(counts.most_common()), orient='index', columns=['Count'])

	# Replace this with your Salesforce Org alias
	org_alias = 'your-org-alias'
	# Query the Contact object for Id, Name, and Favorite_Food__c
	query = "SELECT Id, Name, Favorite_Food__c FROM Contact"
	# NOTE: this string is executed through the shell below and the values are
	# interpolated without shell quoting, so keep org_alias and query trusted.
	cmd = f'sfdx force:data:soql:query --query "{query}" --targetusername {org_alias} --json'
	# Run the sfdx command and capture its stdout as text; a non-zero exit
	# status raises subprocess.CalledProcessError.
	sfdx_output = subprocess.check_output(cmd, shell=True, text=True)
	# Parse the JSON output and create a DataFrame with one row per record
	sfdx_json = json.loads(sfdx_output)
	records = sfdx_json['result']['records']
	df = pd.DataFrame(records)
	# A query that matches no records yields a DataFrame without any columns,
	# so selecting the column directly would raise KeyError; analyse empty
	# text in that case.
	if 'Favorite_Food__c' in df.columns:
	    # Join all values into one space-separated string; str.cat omits
	    # missing values when no na_rep is given.
	    favorite_foods = df['Favorite_Food__c'].str.cat(sep=' ')
	else:
	    favorite_foods = ''
	theners = get_ners(favorite_foods)
	print(theners)