Created
February 9, 2019 21:07
-
-
Save Shritesh99/2e14cb8d87943de4562a62cfe505cc87 to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
""" | |
Techinical Intern Basic Test - Imgtranslate.com | |
There are 3 zip files hosted at http://staging.imgtranslate.com:9999/ with names a.zip b.zip and c.zip. | |
They each contain text files with sentences on new lines. | |
You are required to write a program that: | |
1. Downloads them | |
2. Unzips them | |
3. Compares the sentences of each file with others | |
4. Only prints pairs of files that contain duplicate sentences | |
@author Shritesh Jamulkar | |
""" | |
# Importing necessary Libraries
import io
import itertools
import os
import re
import zipfile
from collections import Counter

import requests
from bs4 import BeautifulSoup
# url | |
url = 'http://staging.imgtranslate.com:9999/' | |
# Web Scraping | |
soup = BeautifulSoup(requests.get(url).content, 'html.parser') | |
# Finding all zip links | |
zip_links = [anchor_tag.string for anchor_tag in soup.find_all("a") if re.match("[a-z].zip", str(anchor_tag.string))] | |
# Downloading all zip files | |
for file in zip_links: | |
request_zip = requests.get(url+file, stream=True) | |
zip_file = zipfile.ZipFile(io.BytesIO(request_zip.content)) | |
zip_file.extractall() | |
# Reading the contents of a directory | |
files = {} # Dictionary of all files | |
for i in range(1,len(os.listdir(os.path.join(os.getcwd(),"a")))+1): | |
files['a{}'.format(i)] = set(open("a/a{}.txt".format(i)).read().splitlines()) | |
# Reading the contents of b directory | |
for i in range(1,len(os.listdir(os.path.join(os.getcwd(),"b")))+1): | |
files['b{}'.format(i)] = set(open("b/b{}.txt".format(i)).read().splitlines()) | |
# Reading the contents of c directory | |
for i in range(1,len(os.listdir(os.path.join(os.getcwd(),"c")))+1): | |
files['c{}'.format(i)] = set(open("c/c{}.txt".format(i)).read().splitlines()) | |
# Itrateing over dictionary to find duplicates | |
all_sets = [] | |
for key1, value1 in files.items(): | |
for key2, value2 in files.items(): | |
if key1 is not key2: | |
duplicates = len(value1.intersection(value2)) | |
if duplicates >= 0: | |
all_sets.append([key1,key2]) | |
# Generating unique sets from all sets | |
ctr = Counter(frozenset(x) for x in all_sets) | |
unique_sets = [list(x) for x in ctr.keys()] | |
# Printing results | |
for i in unique_sets: | |
print("{}, {}".format(i[0], i[1])) | |
""" | |
Results:- | |
a1, a2 | |
a3, a1 | |
a1, b1 | |
a1, b2 | |
a1, b3 | |
a1, b4 | |
a1, c1 | |
a1, c2 | |
a3, a2 | |
a2, b1 | |
a2, b2 | |
b3, a2 | |
a2, b4 | |
a2, c1 | |
a2, c2 | |
a3, b1 | |
a3, b2 | |
a3, b3 | |
a3, b4 | |
a3, c1 | |
a3, c2 | |
b2, b1 | |
b3, b1 | |
b4, b1 | |
c1, b1 | |
c2, b1 | |
b3, b2 | |
b4, b2 | |
b2, c1 | |
c2, b2 | |
b3, b4 | |
b3, c1 | |
b3, c2 | |
b4, c1 | |
b4, c2 | |
c2, c1 | |
""" |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment