"""
Technical Intern Basic Test - Imgtranslate.com
There are 3 zip files hosted at http://staging.imgtranslate.com:9999/, named a.zip, b.zip and c.zip.
They each contain text files with sentences on new lines.
You are required to write a program that:
1. Downloads them
2. Unzips them
3. Compares the sentences of each file with others
4. Only prints pairs of files that contain duplicate sentences
@author Shritesh Jamulkar
"""
# Importing necessary Libraries
import os
import io
import requests
import zipfile
import re
from bs4 import BeautifulSoup
from collections import Counter
# url
url = 'http://staging.imgtranslate.com:9999/'
# Web Scraping
soup = BeautifulSoup(requests.get(url).content, 'html.parser')
# Finding all zip links
zip_links = [anchor_tag.string for anchor_tag in soup.find_all("a") if re.match(r"[a-z]\.zip", str(anchor_tag.string))]
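# e.g. an anchor whose text is "a.zip" matches the pattern above, while one
# pointing at "index.html" does not.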
# Downloading all zip files
for file in zip_links:
    request_zip = requests.get(url + file, stream=True)
    zip_file = zipfile.ZipFile(io.BytesIO(request_zip.content))
    zip_file.extractall()
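# Note: wrapping the response bytes in io.BytesIO lets zipfile.ZipFile read the
# archive straight from memory, so no temporary .zip file is written to disk.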
# Reading the contents of the a, b and c directories
files = {}  # Maps a file name such as "a1" to the set of sentences it contains
for directory in ("a", "b", "c"):
    for i in range(1, len(os.listdir(os.path.join(os.getcwd(), directory))) + 1):
        with open("{0}/{0}{1}.txt".format(directory, i)) as text_file:
            files["{}{}".format(directory, i)] = set(text_file.read().splitlines())
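# Storing each file's sentences as a set means the duplicate check below is a
# single set intersection per pair of files, rather than comparing every
# sentence against every other sentence by hand.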
# Iterating over the dictionary to find pairs of files that share sentences
all_sets = []
for key1, value1 in files.items():
    for key2, value2 in files.items():
        if key1 != key2:
            duplicates = len(value1.intersection(value2))
            if duplicates > 0:
                all_sets.append([key1, key2])
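# Note: looping over files.items() twice visits every pair in both orders, e.g.
# ['a1', 'b1'] and ['b1', 'a1']; itertools.combinations(files, 2) would yield
# each unordered pair exactly once and make the de-duplication below unnecessary.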
# Collapsing the mirrored pairs into unique unordered pairs
ctr = Counter(frozenset(x) for x in all_sets)
unique_sets = [list(x) for x in ctr.keys()]
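# Example: frozenset(['a1', 'b1']) == frozenset(['b1', 'a1']), so the Counter
# above keeps a single key per unordered pair collected in the loop.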
# Printing results
for i in unique_sets:
    print("{}, {}".format(i[0], i[1]))
"""
Results:-
a1, a2
a3, a1
a1, b1
a1, b2
a1, b3
a1, b4
a1, c1
a1, c2
a3, a2
a2, b1
a2, b2
b3, a2
a2, b4
a2, c1
a2, c2
a3, b1
a3, b2
a3, b3
a3, b4
a3, c1
a3, c2
b2, b1
b3, b1
b4, b1
c1, b1
c2, b1
b3, b2
b4, b2
b2, c1
c2, b2
b3, b4
b3, c1
b3, c2
b4, c1
b4, c2
c2, c1
"""