anonymous_toolsForBigData andrittt

## adjusted_rand_index.py
truth = [
set(['DMDR1U2RA7VN', 'K29U1709EA5R', 'D3NAY0YYFO4P', '58D4CGTDM5VX', 'ZLRB9DMOYSM9', 'J27VW94YYJRP', '77FOA4UNWD8Y', 'W0JQH817T6IE', 'OTXGMC3STDZ7', 'F4R4MW6W1BO8']),
set(['NY0XRPCQX2J6', '5B15T46T75XM', 'QKPLUGBHWX1S', '90BP7NQLOZI8', 'H3ETKWH70OZ0', 'BWWQDUXMWDTU', '0J5OWQRLV2ZF', 'D0K9L1DTG1EQ', 'SRXWGC3XXJJO', '148X2AS0P7MP']),
set(['YS0M2FXHFUKK', 'KASAZL3RPKK6', 'ZILSSCBC40IR', 'NEFEWA5CEPMW', '8DGQWN7D24RW', 'G1FQA6E96794', 'XNP69S9V9849', 'X5YBR7LX367U', '7INXG6910I57', 'W6G19WDE9FBN']),
set(['0TIBYZMOJD10', '3QBNSX4XCPSA', 'X3NC9RI7ZPUK', 'FRVXUX3X2S3R', 'V9GUVOSSR83H', '9ED47BUW3J9B', '1RY6YNAXRI7X', 'VWQTW530L7HU', 'MBA1GBU5A3MJ', 'FQR5NJPRAQ1T']),
set(['27BMODQ3KSDY', '2WRJA9D9SEPC', 'Q6RVWKG553K7', '8S46FET9O2Y1', 'AG7PEPJHIALE', 'WJ9Y2OG0EKR7', 'PLXC6ZHQIVVA', 'YRTYMIDTOV1R', '2DM3J4TN9557', 'LBVFSL8OUUHG']),
set(['L1EYAG4PN55N', 'WXA3PLRSG53G', '74SBBUUA94N3', 'AQ6XWF6SZZ3K', 'B45DHKLKJDYD', '5OM79AIPHX6W', 'ELVYERD2OSIT', '21USARENDKEH', 'VBEY9RLYA5IF', 'MZMYC75VUQCA']),
set(['

## ex2.py
from __future__ import division
import json
import time
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.cross_validation import train_test_split
from sklearn.ensemble import RandomForestClassifier


## ex2.py
import cv2 # for this I needed to install opencv -> pip install opencv-python
import numpy as np

def getHashLshAlgorithm(img):
    # Resize to 9x8 pixels
    img = cv2.resize(img,(9,8))

    # Compare adjacent values (x>y)
    img_compared = np.empty((8,8))
    for i,row in enumerate(img):

## cat.png

      
              4 files
            
          
              0 forks
            
          
                0 comments
              
            
              0 stars
            
          
                andrittt
                / cat.png
            
            
              Last active
              November 20, 2017 15:54
            
              
                Tools For Big Data -- Week 7 -- Cat Figures
              
          
## ex3.py
from __future__ import division,print_function
import sqlite3
import time
import heapq
from multiprocessing import Process, Pool

# get connection to the database
con = sqlite3.connect(r"C:\nice\simple\path\reddit.db")
# Use str as the factory for TEXT values read from the database.
# NOTE(review): presumably chosen to get plain byte strings instead of
# unicode objects under Python 2 -- confirm against the queries that follow.
con.text_factory = str

## ex2.py
from __future__ import division,print_function
import sqlite3
import time
import heapq
from multiprocessing import Process, Pool
from itertools import combinations
from collections import defaultdict
from itertools import combinations
import operator

## ex1.py
from __future__ import print_function
import sqlite3
import time
from multiprocessing import Process, Pool
import string
import heapq


# Create a connection to the database
con = sqlite3.connect('/Users/some/cool/path/reddit.db')

## mr_euler_graphs.py
#!/usr/bin/env python

from mrjob.job import MRJob
from mrjob.step import MRStep
import re
import sys

# The procedure is quite simple. In the mapper we simply add a degree for each node that appears in every edge.
# We then use one reducer to sum up the occurrences of each node, which is the same as the degree of the node.
# We then add 0 to the degree_array if the node has an even degree; otherwise we add 1 to the degree_array.

## mr_word_occurences.py
#!/usr/bin/env python

from mrjob.job import MRJob
import re
import sys

# https://docs.python.org/3/library/re.html#re.compile
# Explanations of re.compile can be found at the link above.

# Matches a run of one or more word characters (\w) and/or apostrophes,
# so contractions such as "don't" are kept as a single token.
WORD_RE = re.compile(r"[\w']+")

## nr_min_sale.py
from mrjob.job import MRJob
from mrjob.step import MRStep

class MinSale(MRJob):

	def mapping(self, _, line):
		"""Emit one ``((store, time), cost)`` pair for a comma-separated record.

		The line is stripped and split on ","; it must contain exactly six
		fields (date, time, store, item, cost, payment), otherwise the
		unpacking raises ValueError. The cost field is converted with int().
		"""
		fields = line.strip().split(",")
		# Only store, time and cost are used; the other fields are unpacked
		# solely to enforce the six-column layout.
		_date, sale_time, shop, _item, amount, _payment = fields
		key = (shop, sale_time)
		yield key, int(amount)
	truth = [
	set(['DMDR1U2RA7VN', 'K29U1709EA5R', 'D3NAY0YYFO4P', '58D4CGTDM5VX', 'ZLRB9DMOYSM9', 'J27VW94YYJRP', '77FOA4UNWD8Y', 'W0JQH817T6IE', 'OTXGMC3STDZ7', 'F4R4MW6W1BO8']),
	set(['NY0XRPCQX2J6', '5B15T46T75XM', 'QKPLUGBHWX1S', '90BP7NQLOZI8', 'H3ETKWH70OZ0', 'BWWQDUXMWDTU', '0J5OWQRLV2ZF', 'D0K9L1DTG1EQ', 'SRXWGC3XXJJO', '148X2AS0P7MP']),
	set(['YS0M2FXHFUKK', 'KASAZL3RPKK6', 'ZILSSCBC40IR', 'NEFEWA5CEPMW', '8DGQWN7D24RW', 'G1FQA6E96794', 'XNP69S9V9849', 'X5YBR7LX367U', '7INXG6910I57', 'W6G19WDE9FBN']),
	set(['0TIBYZMOJD10', '3QBNSX4XCPSA', 'X3NC9RI7ZPUK', 'FRVXUX3X2S3R', 'V9GUVOSSR83H', '9ED47BUW3J9B', '1RY6YNAXRI7X', 'VWQTW530L7HU', 'MBA1GBU5A3MJ', 'FQR5NJPRAQ1T']),
	set(['27BMODQ3KSDY', '2WRJA9D9SEPC', 'Q6RVWKG553K7', '8S46FET9O2Y1', 'AG7PEPJHIALE', 'WJ9Y2OG0EKR7', 'PLXC6ZHQIVVA', 'YRTYMIDTOV1R', '2DM3J4TN9557', 'LBVFSL8OUUHG']),
	set(['L1EYAG4PN55N', 'WXA3PLRSG53G', '74SBBUUA94N3', 'AQ6XWF6SZZ3K', 'B45DHKLKJDYD', '5OM79AIPHX6W', 'ELVYERD2OSIT', '21USARENDKEH', 'VBEY9RLYA5IF', 'MZMYC75VUQCA']),
	set(['
	from __future__ import division
	import json
	import time
	import numpy as np
	from sklearn.feature_extraction.text import CountVectorizer
	from sklearn.cross_validation import train_test_split
	from sklearn.ensemble import RandomForestClassifier
	import cv2 # for this I needed to install opencv -> pip install opencv-python
	import numpy as np

	def getHashLshAlgorithm(img):
	# Resize to 9x8 pixels
	img = cv2.resize(img,(9,8))

	# Compare adjacent values (x>y)
	img_compared = np.empty((8,8))
	for i,row in enumerate(img):
	from __future__ import division,print_function
	import sqlite3
	import time
	import heapq
	from multiprocessing import Process, Pool

	# get connection to the database
	con = sqlite3.connect(r"C:\nice\simple\path\reddit.db")
	con.text_factory = str
	from __future__ import print_function
	import sqlite3
	import time
	from multiprocessing import Process, Pool
	import string
	import heapq


	# Create a connection to the database
	con = sqlite3.connect('/Users/some/cool/path/reddit.db')
	#!/usr/bin/env python

	from mrjob.job import MRJob
	from mrjob.step import MRStep
	import re
	import sys

	# The procedure is quite simple. In the mapper we simply add a degree for each node that appears in every edge.
	# We then use one reducer to sum up the occurrences of each node, which is the same as the degree of the node.
	# We then add 0 to the degree_array if the node has an even degree; otherwise we add 1 to the degree_array.
	from mrjob.job import MRJob
	from mrjob.step import MRStep

	class MinSale(MRJob):

	def mapping(self, _, line):
	data = line.strip().split(",")
	date, time, store, item, cost, payment = data
	yield (store,time), int(cost)