@edemnati
Last active August 19, 2023
import requests
from bs4 import BeautifulSoup
import pandas as pd

# Test url
url = 'https://www.theguardian.com/technology/2019/aug/28/apple-ends-contracts-hundreds-workers-hired-to-listen-siri'

# Request the article url to get the web page content
article = requests.get(url)

# 1. Extract all paragraph elements inside the page body
articles = BeautifulSoup(article.content, 'html.parser')
articles_body = articles.find_all('body')
p_blocks = articles_body[0].find_all('p')

# 2. For each paragraph, construct its parent elements hierarchy
# Collect the p_blocks data as a list of rows, then build the dataframe
p_blocks_rows = []
for i in range(0, len(p_blocks)):
    # 2.1 Loop through the paragraph's parents to extract each element name and id
    parents_list = []
    for parent in p_blocks[i].parents:
        # Extract the parent id attribute if it exists
        parent_id = ''
        try:
            parent_id = parent['id']
        except KeyError:
            pass
        # Append the parent name and id to the parents list
        parents_list.append(parent.name + ' id: ' + parent_id)

    # 2.2 Construct the parents hierarchy, from the outermost element down to the paragraph
    parent_element_list = ['' if (x == 'None' or x is None) else x for x in parents_list]
    parent_element_list.reverse()
    parent_hierarchy = ' -> '.join(parent_element_list)

    # Append the current paragraph data to the rows list
    p_blocks_rows.append({'element_name': p_blocks[i].name,
                          'parent_hierarchy': parent_hierarchy,
                          'element_text': p_blocks[i].text,
                          'element_text_Count': len(str(p_blocks[i].text))})

p_blocks_df = pd.DataFrame(p_blocks_rows,
                           columns=['element_name', 'parent_hierarchy',
                                    'element_text', 'element_text_Count'])

# 3. Concatenate paragraphs under the same parent hierarchy
if len(p_blocks_df) > 0:
    p_blocks_df_groupby_parent_hierarchy = p_blocks_df.groupby(by=['parent_hierarchy'])

    # 4. Sum the paragraph text lengths per parent hierarchy
    p_blocks_df_groupby_parent_hierarchy_sum = p_blocks_df_groupby_parent_hierarchy[['element_text_Count']].sum()
    p_blocks_df_groupby_parent_hierarchy_sum.reset_index(inplace=True)

    # 5. Select the parent hierarchy holding the most text as the main article
    maxid = p_blocks_df_groupby_parent_hierarchy_sum.loc[p_blocks_df_groupby_parent_hierarchy_sum['element_text_Count'].idxmax()
                                                         , 'parent_hierarchy']
    merge_text = '\n'.join(p_blocks_df.loc[p_blocks_df['parent_hierarchy'] == maxid, 'element_text'].to_list())
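A quick way to sanity-check the result, assuming the script above has run without errors: print the parent hierarchies ranked by total text length, then a preview of the merged article text. The variable names are the ones defined above; the 500-character preview length is an arbitrary choice.

# Inspect the grouped hierarchies, largest total text first,
# to confirm the selected hierarchy really holds the article body
print(p_blocks_df_groupby_parent_hierarchy_sum
      .sort_values(by='element_text_Count', ascending=False)
      .head())

# Preview the selected hierarchy and the extracted main article text
print(maxid)
print(merge_text[:500])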
lcatbt commented Jun 9, 2023

It doesn't work with url = df_news_feed.link[0]

Why not, and how do I have to change the code?
