Skip to content

Instantly share code, notes, and snippets.

View andrittt's full-sized avatar

anonymous_toolsForBigData andrittt

View GitHub Profile
@andrittt
andrittt / adjusted_rand_index.py
Last active December 9, 2017 11:12
Challenge 3 - Code
truth = [
set(['DMDR1U2RA7VN', 'K29U1709EA5R', 'D3NAY0YYFO4P', '58D4CGTDM5VX', 'ZLRB9DMOYSM9', 'J27VW94YYJRP', '77FOA4UNWD8Y', 'W0JQH817T6IE', 'OTXGMC3STDZ7', 'F4R4MW6W1BO8']),
set(['NY0XRPCQX2J6', '5B15T46T75XM', 'QKPLUGBHWX1S', '90BP7NQLOZI8', 'H3ETKWH70OZ0', 'BWWQDUXMWDTU', '0J5OWQRLV2ZF', 'D0K9L1DTG1EQ', 'SRXWGC3XXJJO', '148X2AS0P7MP']),
set(['YS0M2FXHFUKK', 'KASAZL3RPKK6', 'ZILSSCBC40IR', 'NEFEWA5CEPMW', '8DGQWN7D24RW', 'G1FQA6E96794', 'XNP69S9V9849', 'X5YBR7LX367U', '7INXG6910I57', 'W6G19WDE9FBN']),
set(['0TIBYZMOJD10', '3QBNSX4XCPSA', 'X3NC9RI7ZPUK', 'FRVXUX3X2S3R', 'V9GUVOSSR83H', '9ED47BUW3J9B', '1RY6YNAXRI7X', 'VWQTW530L7HU', 'MBA1GBU5A3MJ', 'FQR5NJPRAQ1T']),
set(['27BMODQ3KSDY', '2WRJA9D9SEPC', 'Q6RVWKG553K7', '8S46FET9O2Y1', 'AG7PEPJHIALE', 'WJ9Y2OG0EKR7', 'PLXC6ZHQIVVA', 'YRTYMIDTOV1R', '2DM3J4TN9557', 'LBVFSL8OUUHG']),
set(['L1EYAG4PN55N', 'WXA3PLRSG53G', '74SBBUUA94N3', 'AQ6XWF6SZZ3K', 'B45DHKLKJDYD', '5OM79AIPHX6W', 'ELVYERD2OSIT', '21USARENDKEH', 'VBEY9RLYA5IF', 'MZMYC75VUQCA']),
set(['
@andrittt
andrittt / ex2.py
Last active November 20, 2017 15:54
Tools For Big Data -- Week 7
from __future__ import division
import json
import time
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.cross_validation import train_test_split
from sklearn.ensemble import RandomForestClassifier
@andrittt
andrittt / ex2.py
Last active November 20, 2017 15:55
Tools For Big Data -- Week 7
import cv2 # for this I needed to install opencv -> pip install opencv-python
import numpy as np
def getHashLshAlgorithm(img):
# Resize to 9x8 pixels
img = cv2.resize(img,(9,8))
# Compare adjacent values (x>y)
img_compared = np.empty((8,8))
for i,row in enumerate(img):
@andrittt
andrittt / cat.png
Last active November 20, 2017 15:54
Tools For Big Data -- Week 7 -- Cat Figures
cat.png
@andrittt
andrittt / ex3.py
Created November 13, 2017 17:32
Challenge 2 - Exercise 3
from __future__ import division,print_function
import sqlite3
import time
import heapq
from multiprocessing import Process, Pool
# get connection to the database
con = sqlite3.connect(r"C:\nice\simple\path\reddit.db")
con.text_factory = str
@andrittt
andrittt / ex2.py
Last active November 13, 2017 17:34
Challenge 2 - Exercise 2
from __future__ import division,print_function
import sqlite3
import time
import heapq
from multiprocessing import Process, Pool
from itertools import combinations
from collections import defaultdict
from itertools import combinations
import operator
@andrittt
andrittt / ex1.py
Last active November 13, 2017 17:28
Challenge 2 - Exercise 1
from __future__ import print_function
import sqlite3
import time
from multiprocessing import Process, Pool
import string
import heapq
# Create a connection to the database
con = sqlite3.connect('/Users/some/cool/path/reddit.db')
@andrittt
andrittt / mr_euler_graphs.py
Last active October 29, 2017 12:10
The python script, a test file and a result file for the Exercise 7.2 in tools for big data
#!/usr/bin/env python
from mrjob.job import MRJob
from mrjob.step import MRStep
import re
import sys
# The procedure is quite simple. In the mapper we simply add a degree for each node that appears in each edge.
# We then use one reducer to sum up the occurrences of each node, which is the same as the degree of the node.
# We then add 0 to the degree_array if the node has an even degree; otherwise we add 1 to the degree_array.
@andrittt
andrittt / mr_word_occurences.py
Last active October 29, 2017 12:10
The python script, a test file and a result file for the Exercise 7.1 in tools for big data
#!/usr/bin/env python
from mrjob.job import MRJob
import re
import sys
# https://docs.python.org/3/library/re.html#re.compile
# An explanation of re.compile can be found at the link above.
WORD_RE = re.compile(r"[\w']+")
@andrittt
andrittt / nr_min_sale.py
Last active October 29, 2017 11:00
The python script, a test file and a result file for the Exercise 7.2 in tools for big data
from mrjob.job import MRJob
from mrjob.step import MRStep
class MinSale(MRJob):
def mapping(self, _, line):
data = line.strip().split(",")
date, time, store, item, cost, payment = data
yield (store,time), int(cost)