Last active
August 19, 2023 18:36
-
-
Save edemnati/add2dc73ee0ddb376f0527020614d66e to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import requests | |
from bs4 import BeautifulSoup | |
import pandas as pd | |
# Extract the main article text from a web page.
#
# Heuristic: collect every <p> element, label each one with the chain of its
# ancestor elements (tag name + id), total the text length per ancestor
# chain, and treat the chain holding the most text as the article body.
# The result is left in `merge_text` (paragraphs joined by newlines) and the
# winning ancestor chain in `maxid`.

# Test url
url = 'https://www.theguardian.com/technology/2019/aug/28/apple-ends-contracts-hundreds-workers-hired-to-listen-siri'

# Request the article url to get the web page content.
# Without a timeout, requests waits forever on an unresponsive server.
article = requests.get(url, timeout=30)

# 1. extract all paragraph elements inside the page body
articles = BeautifulSoup(article.content, 'html.parser')
articles_body = articles.find_all('body')
# html.parser does not invent a missing <body>, so fall back to searching the
# whole document instead of failing with an IndexError.
p_blocks = articles_body[0].find_all('p') if articles_body else articles.find_all('p')

# 2. for each paragraph, construct its parent elements hierarchy
# Rows are gathered in a plain list and turned into a dataframe once:
# DataFrame.append was removed in pandas 2.0 and copied the frame on each call.
p_blocks_rows = []
for p_block in p_blocks:
    # 2.1 Loop through the paragraph's parents to extract each element name
    #     and its id attribute (empty string when the element has no id).
    parents_list = [
        parent.name + 'id: ' + parent.get('id', '')
        for parent in p_block.parents
    ]

    # 2.2 Construct the parents hierarchy, outermost element first
    #     (.parents yields the innermost parent first, hence the reversal).
    parent_hierarchy = ' -> '.join(reversed(parents_list))

    # Record the current paragraph data
    element_text = p_block.text
    p_blocks_rows.append({
        'element_name': p_block.name,
        'parent_hierarchy': parent_hierarchy,
        'element_text': element_text,
        'element_text_Count': len(element_text),
    })

# Dataframe collecting the p_blocks data; the explicit column list keeps the
# schema stable even when the page contains no paragraphs.
p_blocks_df = pd.DataFrame(
    p_blocks_rows,
    columns=['element_name', 'parent_hierarchy', 'element_text', 'element_text_Count'],
)

# Defaults so both names always exist, even for a page without paragraphs.
maxid = None
merge_text = ''

# 3. concatenate paragraphs under the same parent hierarchy
if not p_blocks_df.empty:
    # 4. count paragraphs length: total text length per parent hierarchy
    p_blocks_df_groupby_parent_hierarchy = p_blocks_df.groupby(by=['parent_hierarchy'])
    p_blocks_df_groupby_parent_hierarchy_sum = (
        p_blocks_df_groupby_parent_hierarchy[['element_text_Count']].sum().reset_index()
    )

    # 5. select the hierarchy with the most text as the main article
    longest_row = p_blocks_df_groupby_parent_hierarchy_sum['element_text_Count'].idxmax()
    maxid = p_blocks_df_groupby_parent_hierarchy_sum.loc[longest_row, 'parent_hierarchy']

    # Join that hierarchy's paragraphs, in document order, one per line.
    is_main_article = p_blocks_df['parent_hierarchy'] == maxid
    merge_text = '\n'.join(p_blocks_df.loc[is_main_article, 'element_text'].to_list())
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
It doesn't work with url = df_news_feed.link[0].
Why not, and how do I have to change the code?