# libbymiller/tumblr_soup.py

## tumblr_soup.py
from pathlib import Path
import re
from bs4 import BeautifulSoup

path = "html"
mydir = Path(path)

# Skeleton of the combined page.  <body> is left open on purpose: the parser
# closes it, and every post is then appended inside it.
PAGE_TEMPLATE = """
        <!DOCTYPE HTML>
        <html>
            <head>
                <meta http-equiv="Content-Type" content="text/html; charset=utf-8"/>
                <link rel="stylesheet" type="text/css" href="style.css"/>
            </head>
            <body>
"""


def local_media_src(src):
    """Return *src* with every literal ``../../`` removed.

    A plain string replace is used because in a regex the dots of ``../../``
    match any character, which would also strip sequences such as ``ab/cd/``.
    """
    return src.replace("../../", "")


def local_video_src(src):
    """Return the page-relative path for a video *src*.

    Besides dropping ``../../``, a trailing ``mp4`` that is not already
    preceded by a dot gets one (``clipmp4`` -> ``clip.mp4``); an existing
    ``.mp4`` suffix is left untouched.
    """
    return re.sub(r"(?<!\.)mp4$", ".mp4", local_media_src(src))


def build_page(html_files):
    """Merge the posts in *html_files* into a single BeautifulSoup document.

    Args:
        html_files: iterable of ``pathlib.Path`` objects pointing at HTML
            files; they are processed in the order given.

    Returns:
        The combined document.  Inside ``<body>`` each post becomes one
        ``<div id="<file name>">`` holding, in order: the caption div, the
        post's first ``<span>``, a ``<video>`` (or an empty ``<span>``
        placeholder) and one ``<img>`` per image in the post.  All
        ``<figure>`` elements are removed at the end.
    """
    page = BeautifulSoup(PAGE_TEMPLATE, "html.parser")

    for file in html_files:
        with file.open(encoding="utf-8") as f:
            post = BeautifulSoup(f, "html.parser")
        # html.parser does not invent a missing <body>; fall back to the
        # whole document so such files do not crash the run.
        body = post.body if post.body is not None else post

        # Look everything up before moving anything: appending a tag to the
        # page detaches it from the post it came from.
        first_div = body.find("div")
        if first_div is not None and "caption" in (first_div.get("class") or []):
            caption = first_div
        else:
            caption = page.new_tag("div")

        first_span = post.find("span")
        if first_span is None:
            first_span = page.new_tag("span")

        embed = body.find("embed")
        if embed is not None and embed.get("src"):
            # No autoplay attribute: it is a boolean attribute, so even
            # autoplay="false" would switch autoplay on.
            video = page.new_tag(
                "video", src=local_video_src(embed["src"]), controls=""
            )
        else:
            video = page.new_tag("span")

        image_srcs = [img["src"] for img in post.find_all("img") if img.get("src")]

        # add in order, grouped per source file inside <body>
        group = page.new_tag("div", id=file.name)
        page.body.append(group)
        group.append(caption)
        group.append(first_span)
        group.append(video)
        for src in image_srcs:
            group.append(page.new_tag("img", src=local_media_src(src)))

    # we don't need figures any more
    for figure in page.find_all("figure"):
        figure.decompose()

    return page


def main():
    """Print the page built from every ``*.html`` file in ``mydir``."""
    print(build_page(sorted(mydir.glob("*.html"))).prettify())


if __name__ == "__main__":
    main()
	from pathlib import Path
	import re
	from bs4 import BeautifulSoup

	path = "html"
	mydir = Path(path)

	# Skeleton of the combined page.  <body> is left open on purpose: the parser
	# closes it, and every post is then appended inside it.
	PAGE_TEMPLATE = """
	<!DOCTYPE HTML>
	<html>
	<head>
	<meta http-equiv="Content-Type" content="text/html; charset=utf-8"/>
	<link rel="stylesheet" type="text/css" href="style.css"/>
	</head>
	<body>
	"""


	def local_media_src(src):
		"""Return *src* with every literal ``../../`` removed.

		A plain string replace is used because in a regex the dots of ``../../``
		match any character, which would also strip sequences such as ``ab/cd/``.
		"""
		return src.replace("../../", "")


	def local_video_src(src):
		"""Return the page-relative path for a video *src*.

		Besides dropping ``../../``, a trailing ``mp4`` that is not already
		preceded by a dot gets one (``clipmp4`` -> ``clip.mp4``); an existing
		``.mp4`` suffix is left untouched.
		"""
		return re.sub(r"(?<!\.)mp4$", ".mp4", local_media_src(src))


	def build_page(html_files):
		"""Merge the posts in *html_files* into a single BeautifulSoup document.

		Args:
			html_files: iterable of ``pathlib.Path`` objects pointing at HTML
				files; they are processed in the order given.

		Returns:
			The combined document.  Inside ``<body>`` each post becomes one
			``<div id="<file name>">`` holding, in order: the caption div, the
			post's first ``<span>``, a ``<video>`` (or an empty ``<span>``
			placeholder) and one ``<img>`` per image in the post.  All
			``<figure>`` elements are removed at the end.
		"""
		page = BeautifulSoup(PAGE_TEMPLATE, "html.parser")

		for file in html_files:
			with file.open(encoding="utf-8") as f:
				post = BeautifulSoup(f, "html.parser")
			# html.parser does not invent a missing <body>; fall back to the
			# whole document so such files do not crash the run.
			body = post.body if post.body is not None else post

			# Look everything up before moving anything: appending a tag to the
			# page detaches it from the post it came from.
			first_div = body.find("div")
			if first_div is not None and "caption" in (first_div.get("class") or []):
				caption = first_div
			else:
				caption = page.new_tag("div")

			first_span = post.find("span")
			if first_span is None:
				first_span = page.new_tag("span")

			embed = body.find("embed")
			if embed is not None and embed.get("src"):
				# No autoplay attribute: it is a boolean attribute, so even
				# autoplay="false" would switch autoplay on.
				video = page.new_tag(
					"video", src=local_video_src(embed["src"]), controls=""
				)
			else:
				video = page.new_tag("span")

			image_srcs = [img["src"] for img in post.find_all("img") if img.get("src")]

			# add in order, grouped per source file inside <body>
			group = page.new_tag("div", id=file.name)
			page.body.append(group)
			group.append(caption)
			group.append(first_span)
			group.append(video)
			for src in image_srcs:
				group.append(page.new_tag("img", src=local_media_src(src)))

		# we don't need figures any more
		for figure in page.find_all("figure"):
			figure.decompose()

		return page


	def main():
		"""Print the page built from every ``*.html`` file in ``mydir``."""
		print(build_page(sorted(mydir.glob("*.html"))).prettify())


	if __name__ == "__main__":
		main()