robla/striptrackers.py

## striptrackers.py
#!/usr/bin/env python
#
# Copyright 2016 Rob Lanphier
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#    http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import argparse
from bs4 import BeautifulSoup


def main():
    """Rewrite Google redirect links in an HTML file and print the result.

    Reads the HTML file named by the ``htmlfile`` command-line argument,
    looks at every ``<a>`` tag, and when its ``href`` points at
    ``www.google.com`` and carries a non-empty ``q`` query parameter,
    replaces the ``href`` with the value of ``q``.  The prettified document
    is then printed on stdout.  Links that do not match are left untouched.
    """
    # Imported once per call instead of once per link inside the loop.
    import urllib.parse

    parser = argparse.ArgumentParser(
        description='Strip tracking URLs from file, print on stdout')
    parser.add_argument('htmlfile')
    args = parser.parse_args()

    # The context manager guarantees the handle is closed once the markup
    # has been read into the soup.
    # NOTE(review): no parser is named here, so bs4 chooses one based on
    # what is installed -- consider pinning one for reproducible output.
    with open(args.htmlfile) as htmlhandle:
        soup = BeautifulSoup(htmlhandle)

    for link in soup.find_all('a'):
        url = link.get('href')
        if not url:
            # Anchor without a usable href: nothing to rewrite.
            continue
        try:
            urlobj = urllib.parse.urlparse(url)
        except ValueError:
            # Malformed href (e.g. a broken IPv6 literal); leave it as is
            # rather than aborting the whole run.
            continue
        if urlobj.netloc != 'www.google.com':
            continue
        # Not every google.com link is a redirect: .get() avoids the
        # KeyError a bare ['q'] lookup raises when the parameter is absent.
        targets = urllib.parse.parse_qs(urlobj.query).get('q')
        if targets and targets[0]:
            link['href'] = targets[0]
    print(soup.prettify())

# Run the command-line tool only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == "__main__":
    main()
	#!/usr/bin/env python
	#
	# Copyright 2016 Rob Lanphier
	#
	# Licensed under the Apache License, Version 2.0 (the "License");
	# you may not use this file except in compliance with the License.
	# You may obtain a copy of the License at
	#
	#    http://www.apache.org/licenses/LICENSE-2.0
	#
	# Unless required by applicable law or agreed to in writing, software
	# distributed under the License is distributed on an "AS IS" BASIS,
	# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
	# See the License for the specific language governing permissions and
	# limitations under the License.

	import argparse
	from bs4 import BeautifulSoup


	def main():
	    """Rewrite Google redirect links in an HTML file and print the result.

	    Reads the HTML file named by the ``htmlfile`` command-line argument,
	    looks at every ``<a>`` tag, and when its ``href`` points at
	    ``www.google.com`` and carries a non-empty ``q`` query parameter,
	    replaces the ``href`` with the value of ``q``.  The prettified
	    document is then printed on stdout.  Links that do not match are
	    left untouched.
	    """
	    # Imported once per call instead of once per link inside the loop.
	    import urllib.parse

	    parser = argparse.ArgumentParser(
	        description='Strip tracking URLs from file, print on stdout')
	    parser.add_argument('htmlfile')
	    args = parser.parse_args()

	    # The context manager guarantees the handle is closed once the
	    # markup has been read into the soup.
	    # NOTE(review): no parser is named here, so bs4 chooses one based on
	    # what is installed -- consider pinning one for reproducible output.
	    with open(args.htmlfile) as htmlhandle:
	        soup = BeautifulSoup(htmlhandle)

	    for link in soup.find_all('a'):
	        url = link.get('href')
	        if not url:
	            # Anchor without a usable href: nothing to rewrite.
	            continue
	        try:
	            urlobj = urllib.parse.urlparse(url)
	        except ValueError:
	            # Malformed href (e.g. a broken IPv6 literal); leave it as
	            # is rather than aborting the whole run.
	            continue
	        if urlobj.netloc != 'www.google.com':
	            continue
	        # Not every google.com link is a redirect: .get() avoids the
	        # KeyError a bare ['q'] lookup raises when the parameter is
	        # absent.
	        targets = urllib.parse.parse_qs(urlobj.query).get('q')
	        if targets and targets[0]:
	            link['href'] = targets[0]
	    print(soup.prettify())

	# Run the command-line tool only when this file is executed as a script,
	# not when it is imported as a module.
	if __name__ == "__main__":
	    main()