haoliplus/capture-wechat.py

## capture-wechat.py
# update: 2019-5-11
import sys
import os
import time
from bs4 import BeautifulSoup
import requests
import pdfkit

def main(url):
    """Download a WeChat article and save it as a local HTML file.

    The page at *url* is fetched, every ``<img>`` carrying a ``data-src``
    attribute gets that value copied into ``src``, and textual references to
    ``location.*`` properties in the markup are replaced with values derived
    from *url*. The result is written to ``<output_path>/<title>.wechat.html``
    and the article title is printed.

    Args:
        url: Address of the article. The path computation assumes an
            ``https://mp.weixin.qq.com/...`` URL.

    Raises:
        AttributeError: If the page has no ``<h2>`` element to take the
            title from.
    """
    # NOTE(review): `...` (Ellipsis) looks like a placeholder for the target
    # directory; as written the path below starts with "Ellipsis/". Set this
    # to a real directory before running -- confirm.
    output_path = ...
    # NOTE(review): this extends the module search path, not the executable
    # PATH; presumably meant for a tool in /usr/local/bin -- confirm.
    sys.path.append('/usr/local/bin/')

    r = requests.get(url)
    soup = BeautifulSoup(r.text, 'html.parser')
    title = soup.find('h2').text.strip()

    # Images are lazy-loaded through `data-src`; promote it to `src` so they
    # load without the page's scripts.
    for img in soup.find_all('img'):
        if img.has_attr('data-src'):
            img['src'] = img['data-src']

    host = 'mp.weixin.qq.com'
    pathname = url.replace('https://', '').replace(host, '')

    # Applied strictly in this order; longer patterns come before the shorter
    # ones they contain.
    # NOTE(review): the 'windows.location.*' patterns look like a misspelling
    # of 'window.location.*'. As written, 'window.location.x' is only matched
    # by the shorter 'location.x' pattern, leaving the 'window.' prefix in
    # place. Confirm intent before changing.
    substitutions = (
        ('windows.location.href', url),
        ('location.href', url),
        ('windows.location.search', ''),
        ('location.search', ''),
        ('windows.location.protocol', 'https:'),
        ('location.protocol', 'https:'),
        ('windows.location.pathname', pathname),
        ('location.pathname', pathname),
        ('windows.location.host', host),
        ('location.host', host),
        ('"//res.wx.qq.com/', '"https://res.wx.qq.com/'),
    )
    content = str(soup)
    for needle, replacement in substitutions:
        content = content.replace(needle, replacement)

    # A title containing a path separator would otherwise point into a
    # (usually non-existent) sub-directory.
    safe_title = title
    for separator in ('/', os.sep):
        safe_title = safe_title.replace(separator, '_')

    path = '{}/{}.wechat.html'.format(output_path, safe_title)
    # Explicit UTF-8 keeps the write independent of the platform's default
    # encoding; the context manager guarantees the handle is closed.
    with open(path, 'w', encoding='utf-8') as html_file:
        html_file.write(content)
    print(title)

# Script entry: all command-line arguments are concatenated (no separator)
# into one URL string, which is then captured.
cli_args = sys.argv[1:]
query_text = ''.join(cli_args)
main(query_text)
	# update: 2019-5-11
	import sys
	import os
	import time
	from bs4 import BeautifulSoup
	import requests
	import pdfkit

	def main(url):
	    """Download a WeChat article and save it as a local HTML file.

	    The page at *url* is fetched, every ``<img>`` carrying a ``data-src``
	    attribute gets that value copied into ``src``, and textual references
	    to ``location.*`` properties in the markup are replaced with values
	    derived from *url*. The result is written to
	    ``<output_path>/<title>.wechat.html`` and the title is printed.

	    Args:
	        url: Address of the article. The path computation assumes an
	            ``https://mp.weixin.qq.com/...`` URL.

	    Raises:
	        AttributeError: If the page has no ``<h2>`` element to take the
	            title from.
	    """
	    # NOTE(review): `...` (Ellipsis) looks like a placeholder for the
	    # target directory; as written the path below starts with
	    # "Ellipsis/". Set this to a real directory before running -- confirm.
	    output_path = ...
	    # NOTE(review): this extends the module search path, not the
	    # executable PATH; presumably meant for a tool in /usr/local/bin.
	    sys.path.append('/usr/local/bin/')

	    r = requests.get(url)
	    soup = BeautifulSoup(r.text, 'html.parser')
	    title = soup.find('h2').text.strip()

	    # Images are lazy-loaded through `data-src`; promote it to `src` so
	    # they load without the page's scripts.
	    for img in soup.find_all('img'):
	        if img.has_attr('data-src'):
	            img['src'] = img['data-src']

	    host = 'mp.weixin.qq.com'
	    pathname = url.replace('https://', '').replace(host, '')

	    # Applied strictly in this order; longer patterns come before the
	    # shorter ones they contain.
	    # NOTE(review): the 'windows.location.*' patterns look like a
	    # misspelling of 'window.location.*'. As written, 'window.location.x'
	    # is only matched by the shorter 'location.x' pattern, leaving the
	    # 'window.' prefix in place. Confirm intent before changing.
	    substitutions = (
	        ('windows.location.href', url),
	        ('location.href', url),
	        ('windows.location.search', ''),
	        ('location.search', ''),
	        ('windows.location.protocol', 'https:'),
	        ('location.protocol', 'https:'),
	        ('windows.location.pathname', pathname),
	        ('location.pathname', pathname),
	        ('windows.location.host', host),
	        ('location.host', host),
	        ('"//res.wx.qq.com/', '"https://res.wx.qq.com/'),
	    )
	    content = str(soup)
	    for needle, replacement in substitutions:
	        content = content.replace(needle, replacement)

	    # A title containing a path separator would otherwise point into a
	    # (usually non-existent) sub-directory.
	    safe_title = title
	    for separator in ('/', os.sep):
	        safe_title = safe_title.replace(separator, '_')

	    path = '{}/{}.wechat.html'.format(output_path, safe_title)
	    # Explicit UTF-8 keeps the write independent of the platform's default
	    # encoding; the context manager guarantees the handle is closed.
	    with open(path, 'w', encoding='utf-8') as html_file:
	        html_file.write(content)
	    print(title)

	# Script entry: all command-line arguments are concatenated (no separator)
	# into one URL string, which is then captured.
	cli_args = sys.argv[1:]
	query_text = ''.join(cli_args)
	main(query_text)