Created
February 9, 2019 21:07
-
-
Save Shritesh99/2e14cb8d87943de4562a62cfe505cc87 to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
""" | |
Techinical Intern Basic Test - Imgtranslate.com | |
There are 3 zip files hosted at http://staging.imgtranslate.com:9999/ with names a.zip b.zip and c.zip. | |
They each contain text files with sentences on new lines. | |
You are required to write a program that: | |
1. Downloads them | |
2. Unzips them | |
3. Compares the sentences of each file with others | |
4. Only prints pairs of files that contain duplicate sentences | |
@author Shritesh Jamulkar | |
""" | |
# Importing necessary Libraries
import io
import itertools
import os
import re
import zipfile
from collections import Counter

import requests
from bs4 import BeautifulSoup
# url | |
url = 'http://staging.imgtranslate.com:9999/' | |
# Web Scraping | |
soup = BeautifulSoup(requests.get(url).content, 'html.parser') | |
# Finding all zip links | |
zip_links = [anchor_tag.string for anchor_tag in soup.find_all("a") if re.match("[a-z].zip", str(anchor_tag.string))] | |
# Downloading all zip files | |
for file in zip_links: | |
request_zip = requests.get(url+file, stream=True) | |
zip_file = zipfile.ZipFile(io.BytesIO(request_zip.content)) | |
zip_file.extractall() | |
# Reading the contents of a directory | |
files = {} # Dictionary of all files | |
for i in range(1,len(os.listdir(os.path.join(os.getcwd(),"a")))+1): | |
files['a{}'.format(i)] = set(open("a/a{}.txt".format(i)).read().splitlines()) | |
# Reading the contents of b directory | |
for i in range(1,len(os.listdir(os.path.join(os.getcwd(),"b")))+1): | |
files['b{}'.format(i)] = set(open("b/b{}.txt".format(i)).read().splitlines()) | |
# Reading the contents of c directory | |
for i in range(1,len(os.listdir(os.path.join(os.getcwd(),"c")))+1): | |
files['c{}'.format(i)] = set(open("c/c{}.txt".format(i)).read().splitlines()) | |
# Itrateing over dictionary to find duplicates | |
all_sets = [] | |
for key1, value1 in files.items(): | |
for key2, value2 in files.items(): | |
if key1 is not key2: | |
duplicates = len(value1.intersection(value2)) | |
if duplicates >= 0: | |
all_sets.append([key1,key2]) | |
# Generating unique sets from all sets | |
ctr = Counter(frozenset(x) for x in all_sets) | |
unique_sets = [list(x) for x in ctr.keys()] | |
# Printing results | |
for i in unique_sets: | |
print("{}, {}".format(i[0], i[1])) | |
""" | |
Results:- | |
a1, a2 | |
a3, a1 | |
a1, b1 | |
a1, b2 | |
a1, b3 | |
a1, b4 | |
a1, c1 | |
a1, c2 | |
a3, a2 | |
a2, b1 | |
a2, b2 | |
b3, a2 | |
a2, b4 | |
a2, c1 | |
a2, c2 | |
a3, b1 | |
a3, b2 | |
a3, b3 | |
a3, b4 | |
a3, c1 | |
a3, c2 | |
b2, b1 | |
b3, b1 | |
b4, b1 | |
c1, b1 | |
c2, b1 | |
b3, b2 | |
b4, b2 | |
b2, c1 | |
c2, b2 | |
b3, b4 | |
b3, c1 | |
b3, c2 | |
b4, c1 | |
b4, c2 | |
c2, c1 | |
""" |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment