# shawngraham/news-to-knowledge-graph.py

## news-to-knowledge-graph.py
import tkinter as tk
from tkinter import filedialog
from tkinter import ttk, filedialog
import pandas as pd
from newsapi import NewsApiClient
import llm
import requests
from strip_tags import strip_tags
# Language model used by llm_processing to extract triples.
model = llm.get_model("orca-mini-3b-gguf2-q4_0") # local model through the llm plugin llm-gpt4all

# Alternatively, use a GPT model (for which you'll need an API key):
#model = llm.get_model("4t")
#model.key = 'model api key here'

# NewsAPI client; 'api here' is a placeholder to be replaced with a real key.
newsapi = NewsApiClient(api_key='api here')
# Module-level DataFrame of fetched articles.
articles_data = None  # stays None until get_news() populates it


#function to process text with llm
def llm_processing(content):
    """Ask the language model to extract knowledge-graph triples from text.

    Args:
        content: Text to analyse.

    Returns:
        The model's reply as a string, or None when ``content`` is empty or
        the model call fails.
    """
    # Nothing to extract from: skip the model call entirely.
    if not content:
        return None

    prompt_text = (
        "You are a knowledge graph assistant. Extract entities and predicates "
        f"from the provided text:\n\n{content}\n\n"
        "Return ONLY the triples formatted for csv: entity1,predicate,entity2."
    )

    try:
        response = model.prompt(prompt_text)
        # llm responses are evaluated lazily; reading the text here makes the
        # model run inside the try block so that failures are caught below.
        return response.text()
    except Exception as e:
        # Best effort: report the problem and let the caller carry on.
        print(f"An error occurred: {e}")
        return None


# Function to get the news data

def get_news():
    """Fetch articles for the query in ``query_entry`` and display them.

    Queries ``newsapi`` for English-language articles, stores the result in
    the module-level ``articles_data`` DataFrame (columns ``title``,
    ``content``, ``url`` and an empty ``llm`` placeholder), shows it in the
    table and enables the process button. Does nothing beyond printing a
    message when the query is blank or the request fails.
    """
    global articles_data  # rebound below

    query = query_entry.get().strip()
    if not query:
        # A blank search is not worth a round trip to the API.
        print("Please enter a search term.")
        return

    try:
        all_stories = newsapi.get_everything(q=query, language='en')
    except Exception as e:
        # GUI callback boundary: report the failure instead of raising.
        print(f"An error occurred while fetching the news: {e}")
        return

    articles = all_stories.get('articles') or []

    # `or` instead of a .get() default so that a key which is present but
    # holds None (or an empty string) also falls back to the placeholder.
    rows = [
        {
            'title': article.get('title') or 'No Title Available',
            'content': article.get('content') or 'No Content Available',
            'url': article.get('url') or 'No Url Available',
            'llm': '',  # filled in later by process_news_with_llm
        }
        for article in articles
    ]

    # Explicit columns keep the frame's shape stable when there are no rows.
    articles_data = pd.DataFrame(rows, columns=['title', 'content', 'url', 'llm'])

    # Show the DataFrame with the still-empty llm column.
    show_dataframe(articles_data)

    # Articles are available, so LLM processing can now be started.
    process_button.config(state='normal')


# Function to retrieve HTML and process it
def process_article_url(url, timeout=30):
    """Download an article page and run its text through the language model.

    Args:
        url: Address of the article page.
        timeout: Seconds to wait for the server before giving up.

    Returns:
        Whatever ``llm_processing`` returns for the cleaned page text, or
        None when the page cannot be fetched.
    """
    try:
        # Without a timeout an unresponsive server would block this call
        # (and the GUI callback that triggered it) indefinitely.
        response = requests.get(url, timeout=timeout)
        response.raise_for_status()  # Treat HTTP error statuses as failures
    except requests.RequestException as e:
        print(f"An error occurred while fetching the article: {e}")
        return None

    # NOTE(review): the second positional argument of strip_tags appears to be
    # a list of selectors to *keep* (content inside <div>), not to exclude —
    # confirm against the strip-tags documentation.
    html_content = strip_tags(
        response.text,
        ["div"],
        minify=True,       # Collapse extra spaces and new lines
        keep_tags=["h1"],  # Keep <h1> tags in the output
    )

    # Hand the cleaned text to the model for triple extraction.
    return llm_processing(html_content)

# Function to process news data with llm ; which works, but 'content' isn't very much

def process_news_with_llm():
    """Fill the ``llm`` column of ``articles_data`` and refresh the table.

    Each article URL is passed to ``process_article_url``; afterwards the
    table is redrawn and the export button is enabled and wired to
    ``export_to_file``. Does nothing when no articles have been fetched.
    """
    if articles_data is None:
        return

    # Run every article URL through the fetch-and-extract pipeline.
    llm_results = articles_data['url'].apply(process_article_url)
    articles_data['llm'] = llm_results

    # Redraw the table with the new results.
    show_dataframe(articles_data)

    # Results exist now, so exporting becomes possible.
    export_button.config(command=lambda: export_to_file(articles_data), state='normal')

# Function to export the dataframe to a file
def export_to_file(df):
    """Ask for a destination file and write ``df`` to it, pipe-delimited.

    Args:
        df: DataFrame to export. Nothing is written when no filename is
            returned by the save dialog.
    """
    file_types = [("CSV files", "*.csv"), ("All files", "*.*")]
    filename = filedialog.asksaveasfilename(defaultextension=".csv", filetypes=file_types)
    if not filename:
        return
    df.to_csv(filename, sep='|', index=False)

# Build the main application window.
root = tk.Tk()
root.title('News API Interface')

# Text box holding the search query; get_news reads it.
query_entry = tk.Entry(root, width=50)
query_entry.pack()

# Button that runs get_news to fetch and display matching articles.
get_news_button = tk.Button(root, text='Get Headlines', command=get_news)
get_news_button.pack()

# Button that runs process_news_with_llm; starts disabled and is enabled by
# get_news.
process_button = tk.Button(root, text='Process Articles with LLM', state='disabled', command=process_news_with_llm)
process_button.pack()

# Export button; it has no command yet. process_news_with_llm assigns the
# command and enables it.
export_button = tk.Button(root, text='Export', state='disabled')  # Disabled until the LLM pass has run
export_button.pack()

# Informational label telling the user what to expect.
top_left_label = tk.Label(root, text="News will load fast, if there is any. LLM will process slowly.", anchor="nw")

# Pack the label after the buttons, anchored north-west with padding of 5 on
# its top and left sides.
top_left_label.pack(anchor="nw", pady=(5, 0), padx=(5, 0))

def update_treeview(df, treeview):
    """Replace the rows shown in ``treeview`` with the rows of ``df``.

    Args:
        df: DataFrame whose rows are displayed, in order.
        treeview: Widget providing ``get_children``, ``delete`` and
            ``insert``.
    """
    # Drop whatever is currently displayed.
    existing_items = treeview.get_children()
    treeview.delete(*existing_items)

    # Append one table row per DataFrame row.
    for _label, record in df.iterrows():
        treeview.insert('', 'end', values=list(record))

# Frame that holds the results table together with its scrollbar.
tree_frame = tk.Frame(root)
tree_frame.pack()

# Vertical scrollbar on the right-hand side of the frame.
tree_scroll = tk.Scrollbar(tree_frame)
tree_scroll.pack(side=tk.RIGHT, fill=tk.Y)

# Results table; it reports its scroll position to the scrollbar.
tree = ttk.Treeview(tree_frame, yscrollcommand=tree_scroll.set, selectmode='browse')
tree.pack()

# Moving the scrollbar scrolls the table vertically.
tree_scroll.config(command=tree.yview)

# Four display columns, matching the four DataFrame columns built in get_news
# (title, content, url, llm).
tree['columns'] = ('Title', 'Content', 'URL', 'LLM Result')

# Left-align every column and its heading; each column gets a width of 240.
for col in tree['columns']:
    tree.column(col, anchor='w', width=240)
    tree.heading(col, text=col, anchor='w')

# Intended overall width of the Treeview widget.
tree_width = 1200  # NOTE(review): not referenced anywhere else in the visible code
# Pack the table again, this time with fill/expand options.
tree.pack(fill='x', expand=True)  # Allow the treeview to expand and fill the x direction of its container

def show_dataframe(df):
    """Display ``df`` in the module-level results table ``tree``."""
    update_treeview(df=df, treeview=tree)


# Start the tkinter event loop for the main window.
root.mainloop()
	import tkinter as tk
	from tkinter import filedialog
	from tkinter import ttk, filedialog
	import pandas as pd
	from newsapi import NewsApiClient
	import llm
	import requests
	from strip_tags import strip_tags
	# Language model used by llm_processing to extract triples.
	model = llm.get_model("orca-mini-3b-gguf2-q4_0") # local model through the llm plugin llm-gpt4all

	# Alternatively, use a GPT model (for which you'll need an API key):
	#model = llm.get_model("4t")
	#model.key = 'model api key here'

	# NewsAPI client; 'api here' is a placeholder to be replaced with a real key.
	newsapi = NewsApiClient(api_key='api here')
	# Module-level DataFrame of fetched articles.
	articles_data = None # stays None until get_news() populates it


	#function to process text with llm
	def llm_processing(content):
	    """Ask the language model to extract knowledge-graph triples from text.

	    Args:
	        content: Text to analyse.

	    Returns:
	        The model's reply as a string, or None when ``content`` is empty or
	        the model call fails.
	    """
	    # Nothing to extract from: skip the model call entirely.
	    if not content:
	        return None

	    prompt_text = (
	        "You are a knowledge graph assistant. Extract entities and predicates "
	        f"from the provided text:\n\n{content}\n\n"
	        "Return ONLY the triples formatted for csv: entity1,predicate,entity2."
	    )

	    try:
	        response = model.prompt(prompt_text)
	        # llm responses are evaluated lazily; reading the text here makes the
	        # model run inside the try block so that failures are caught below.
	        return response.text()
	    except Exception as e:
	        # Best effort: report the problem and let the caller carry on.
	        print(f"An error occurred: {e}")
	        return None


	# Function to get the news data

	def get_news():
	    """Fetch articles for the query in ``query_entry`` and display them.

	    Queries ``newsapi`` for English-language articles, stores the result in
	    the module-level ``articles_data`` DataFrame (columns ``title``,
	    ``content``, ``url`` and an empty ``llm`` placeholder), shows it in the
	    table and enables the process button. Does nothing beyond printing a
	    message when the query is blank or the request fails.
	    """
	    global articles_data  # rebound below

	    query = query_entry.get().strip()
	    if not query:
	        # A blank search is not worth a round trip to the API.
	        print("Please enter a search term.")
	        return

	    try:
	        all_stories = newsapi.get_everything(q=query, language='en')
	    except Exception as e:
	        # GUI callback boundary: report the failure instead of raising.
	        print(f"An error occurred while fetching the news: {e}")
	        return

	    articles = all_stories.get('articles') or []

	    # `or` instead of a .get() default so that a key which is present but
	    # holds None (or an empty string) also falls back to the placeholder.
	    rows = [
	        {
	            'title': article.get('title') or 'No Title Available',
	            'content': article.get('content') or 'No Content Available',
	            'url': article.get('url') or 'No Url Available',
	            'llm': '',  # filled in later by process_news_with_llm
	        }
	        for article in articles
	    ]

	    # Explicit columns keep the frame's shape stable when there are no rows.
	    articles_data = pd.DataFrame(rows, columns=['title', 'content', 'url', 'llm'])

	    # Show the DataFrame with the still-empty llm column.
	    show_dataframe(articles_data)

	    # Articles are available, so LLM processing can now be started.
	    process_button.config(state='normal')


	# Function to retrieve HTML and process it
	def process_article_url(url, timeout=30):
	    """Download an article page and run its text through the language model.

	    Args:
	        url: Address of the article page.
	        timeout: Seconds to wait for the server before giving up.

	    Returns:
	        Whatever ``llm_processing`` returns for the cleaned page text, or
	        None when the page cannot be fetched.
	    """
	    try:
	        # Without a timeout an unresponsive server would block this call
	        # (and the GUI callback that triggered it) indefinitely.
	        response = requests.get(url, timeout=timeout)
	        response.raise_for_status()  # Treat HTTP error statuses as failures
	    except requests.RequestException as e:
	        print(f"An error occurred while fetching the article: {e}")
	        return None

	    # NOTE(review): the second positional argument of strip_tags appears to be
	    # a list of selectors to *keep* (content inside <div>), not to exclude —
	    # confirm against the strip-tags documentation.
	    html_content = strip_tags(
	        response.text,
	        ["div"],
	        minify=True,       # Collapse extra spaces and new lines
	        keep_tags=["h1"],  # Keep <h1> tags in the output
	    )

	    # Hand the cleaned text to the model for triple extraction.
	    return llm_processing(html_content)

	# Function to process news data with llm ; which works, but 'content' isn't very much

	def process_news_with_llm():
	    """Fill the ``llm`` column of ``articles_data`` and refresh the table.

	    Each article URL is passed to ``process_article_url``; afterwards the
	    table is redrawn and the export button is enabled and wired to
	    ``export_to_file``. Does nothing when no articles have been fetched.
	    """
	    global articles_data

	    if articles_data is not None:
	        # Update the 'llm' column by processing each url with process_article_url
	        articles_data['llm'] = articles_data['url'].apply(process_article_url)

	        # Update the displayed DataFrame
	        show_dataframe(articles_data)

	        # Enable the export button and set the command to export the df to a file
	        export_button.config(state='normal', command=lambda: export_to_file(articles_data))

	# Function to export the dataframe to a file
	def export_to_file(df):
	    """Ask for a destination file and write ``df`` to it, pipe-delimited.

	    Args:
	        df: DataFrame to export. Nothing is written when no filename is
	            returned by the save dialog.
	    """
	    # File-type patterns need the leading "*" wildcard.
	    filename = filedialog.asksaveasfilename(defaultextension=".csv", filetypes=[("CSV files", "*.csv"), ("All files", "*.*")])
	    if filename:
	        # The separator is a single "|" character, not the two-character
	        # string backslash + pipe.
	        df.to_csv(filename, sep='|', index=False)

	# Build the main application window.
	root = tk.Tk()
	root.title('News API Interface')

	# Text box holding the search query; get_news reads it.
	query_entry = tk.Entry(root, width=50)
	query_entry.pack()

	# Button that runs get_news to fetch and display matching articles.
	get_news_button = tk.Button(root, text='Get Headlines', command=get_news)
	get_news_button.pack()

	# Button that runs process_news_with_llm; starts disabled and is enabled by
	# get_news.
	process_button = tk.Button(root, text='Process Articles with LLM', state='disabled', command=process_news_with_llm)
	process_button.pack()

	# Export button; it has no command yet. process_news_with_llm assigns the
	# command and enables it.
	export_button = tk.Button(root, text='Export', state='disabled') # Disabled until the LLM pass has run
	export_button.pack()

	# Informational label telling the user what to expect.
	top_left_label = tk.Label(root, text="News will load fast, if there is any. LLM will process slowly.", anchor="nw")

	# Pack the label after the buttons, anchored north-west with padding of 5 on
	# its top and left sides.
	top_left_label.pack(anchor="nw", pady=(5, 0), padx=(5, 0))

	def update_treeview(df, treeview):
	    """Replace the rows shown in ``treeview`` with the rows of ``df``.

	    Args:
	        df: DataFrame whose rows are displayed, in order.
	        treeview: Widget providing ``get_children``, ``delete`` and
	            ``insert``.
	    """
	    # Clear current items in the treeview
	    treeview.delete(*treeview.get_children())
	    # Add one table row per DataFrame row
	    for _, row in df.iterrows():
	        treeview.insert('', 'end', values=list(row))

	# Frame that holds the results table together with its scrollbar.
	tree_frame = tk.Frame(root)
	tree_frame.pack()

	# Vertical scrollbar on the right-hand side of the frame.
	tree_scroll = tk.Scrollbar(tree_frame)
	tree_scroll.pack(side=tk.RIGHT, fill=tk.Y)

	# Results table; it reports its scroll position to the scrollbar.
	tree = ttk.Treeview(tree_frame, yscrollcommand=tree_scroll.set, selectmode='browse')
	tree.pack()

	# Moving the scrollbar scrolls the table vertically.
	tree_scroll.config(command=tree.yview)

	# Four display columns, matching the four DataFrame columns built in get_news
	# (title, content, url, llm).
	tree['columns'] = ('Title', 'Content', 'URL', 'LLM Result')

	# Left-align every column and its heading; each column gets a width of 240.
	# The loop body is indented so that it belongs to the for statement.
	for col in tree['columns']:
	    tree.column(col, anchor='w', width=240)
	    tree.heading(col, text=col, anchor='w')

	# Intended overall width of the Treeview widget.
	tree_width = 1200 # NOTE(review): not referenced anywhere else in the visible code
	# Pack the table again, this time with fill/expand options.
	tree.pack(fill='x', expand=True) # Allow the treeview to expand and fill the x direction of its container

	def show_dataframe(df):
	    """Display ``df`` in the module-level results table ``tree``."""
	    update_treeview(df, tree)


	# Start the tkinter event loop for the main window.
	root.mainloop()