Skip to content

Instantly share code, notes, and snippets.

@RichardBronosky
Last active November 8, 2023 13:24
Show Gist options
  • Star 5 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save RichardBronosky/4060082 to your computer and use it in GitHub Desktop.
Save RichardBronosky/4060082 to your computer and use it in GitHub Desktop.
<!--
  Sample markup for the scraping demo: three table rows, each holding one
  cell of class "pos" with a quoted label, a line break and a bold value.
  NOTE(review): presumably this is the test.html file that the Python
  script downloads; confirm before editing any of the text below, since the
  script matches on it.
-->
<tr>
<td class="pos">\n
"Some text:"\n
<br>\n
<strong>some value</strong>\n
</td>
</tr>
<tr>
<td class="pos">\n
"Fixed text:"\n
<br>\n
<strong>text I am looking for</strong>\n
</td>
</tr>
<tr>
<td class="pos">\n
"Some other text:"\n
<br>\n
<strong>some other value</strong>\n
</td>
</tr>
# Taken from https://gist.github.com/4060082
# If you have BeautifulSoup, you can test this locally via:
# curl https://gist.githubusercontent.com/RichardBronosky/4060082/raw/test.py | python
from BeautifulSoup import BeautifulSoup
from urllib2 import urlopen
from pprint import pprint
import re
soup = BeautifulSoup(urlopen('https://gist.githubusercontent.com/RichardBronosky/4060082/raw/test.html').read())
# I'm going to assume that Peter knew that re.compile is meant to cache a computation result for a performance benefit. However, I'm going to do that explicitly here to be very clear.
pattern = re.compile('Fixed text')
# Peter's suggestion here returns a list of what appear to be strings
columns = soup.findAll('td', text=pattern, attrs={'class' : 'pos'})
# ...but it is actually a BeautifulSoup.NavigableString
print type(columns[0])
#>> <class 'BeautifulSoup.NavigableString'>
# you can reach the tag using one of the convenience attributes seen here
pprint(columns[0].__dict__)
#>> {'next': <br />,
#>> 'nextSibling': <br />,
#>> 'parent': <td class="pos">\n
#>> "Fixed text:"\n
#>> <br />\n
#>> <strong>text I am looking for</strong>\n
#>> </td>,
#>> 'previous': <td class="pos">\n
#>> "Fixed text:"\n
#>> <br />\n
#>> <strong>text I am looking for</strong>\n
#>> </td>,
#>> 'previousSibling': None}
# I feel that 'parent' is safer to use than 'previous' based on http://www.crummy.com/software/BeautifulSoup/bs4/doc/#method-names
# So, if you want to find the 'text' in the 'strong' element...
pprint([t.parent.find('strong').text for t in soup.findAll('td', text=pattern, attrs={'class' : 'pos'})])
#>> [u'text I am looking for']
# Here is what we have learned:
print soup.find('strong')
#>> <strong>some value</strong>
print soup.find('strong', text='some value')
#>> u'some value'
print soup.find('strong', text='some value').parent
#>> <strong>some value</strong>
print soup.find('strong', text='some value') == soup.find('strong')
#>> False
print soup.find('strong', text='some value') == soup.find('strong').text
#>> True
print soup.find('strong', text='some value').parent == soup.find('strong')
#>> True
@shadowruge
Copy link

Update for Python 3.12 on Debian 12:

from bs4 import BeautifulSoup
import urllib.request
import re

# Raw URL of the sample markup published with this gist. An empty string
# cannot be opened by urlopen, so point at the fixture directly.
url = 'https://gist.githubusercontent.com/RichardBronosky/4060082/raw/test.html'

# The with statement closes the connection once the document is parsed.
with urllib.request.urlopen(url) as response:
    soup = BeautifulSoup(response, 'html.parser')

pattern = re.compile('Fixed text')

# In bs4, combining a tag name with string= only matches tags whose .string
# is set, and .string is None for a <td> that has several children. Search
# the text nodes instead and keep those sitting directly inside a td.pos cell.
columns = [
    text_node
    for text_node in soup.find_all(string=pattern)
    if text_node.parent.name == 'td'
    and 'pos' in (text_node.parent.get('class') or [])
]

if columns:
    print(type(columns[0]))
else:
    print("The 'columns' list is empty.")

# Skip cells without a <strong> child rather than failing on None.
print([
    strong.text
    for strong in (t.parent.find('strong') for t in columns)
    if strong is not None
])

print(soup.find('strong'))

# Looked up once and reused; the last comparison originally dereferenced
# .parent without checking that the lookup had found anything.
some_value = soup.find('strong', string='some value')
print(some_value)

if some_value is not None:
    print(some_value.parent)

    if some_value == soup.find('strong'):
        print("Both elements are the same.")

    if some_value.parent == soup.find('strong'):
        print("Parent of 'some value' is the same as 'strong'.")

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment