Skip to content

Instantly share code, notes, and snippets.

@rodrigore
Last active November 25, 2023 00:21
Show Gist options
  • Star 1 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save rodrigore/8c93ec5bb21b522d57638c92d98ae7d4 to your computer and use it in GitHub Desktop.
Save rodrigore/8c93ec5bb21b522d57638c92d98ae7d4 to your computer and use it in GitHub Desktop.

Ejercicio 1 - Carga de archivos en HDFS

# Create the target folder in HDFS (-p: do not fail if it already exists)
hdfs dfs -mkdir -p /movielens-20m

# Download the data files from S3 into the local folder movies/
aws s3 cp s3://bigdata-desafio/challenges/u3act1/ movies/ --recursive

# Copy the downloaded files (not the movies/ folder itself) into HDFS.
# Copying the folder into the already-existing /movielens-20m would nest the
# files under /movielens-20m/movies/, while the streaming jobs read them from
# /movielens-20m/<file>.csv.
hdfs dfs -copyFromLocal movies/* /movielens-20m

# List the contents of /movielens-20m
hdfs dfs -ls /movielens-20m

# Locate the Hadoop streaming jar
find /usr/lib -name '*streaming*' -print

Ejercicio 2 - Utilizando el archivo genome-scores.csv

# Local dry run of the map -> sort -> reduce pipeline
python mapper_1.py < movies/genome-scores.csv | sort -k 1,1 | python reducer_1.py

# Make both scripts executable so Hadoop streaming can launch them
chmod +x mapper_1.py reducer_1.py

# Run the streaming job on the cluster
hadoop jar /usr/lib/hadoop-mapreduce/hadoop-streaming-2.8.5-amzn-5.jar \
    -file mapper_1.py -mapper mapper_1.py \
    -file reducer_1.py -reducer reducer_1.py \
    -input ///movielens-20m/genome-scores.csv \
    -output register-results

# Merge the job's output parts into a single local file
hdfs dfs -getmerge register-results results_1.txt

Ejercicio 3

# Make both scripts executable so Hadoop streaming can launch them
chmod +x mapper_2.py reducer_2.py

# Run the streaming job on the cluster
hadoop jar /usr/lib/hadoop-mapreduce/hadoop-streaming-2.8.5-amzn-5.jar \
    -file mapper_2.py -mapper mapper_2.py \
    -file reducer_2.py -reducer reducer_2.py \
    -input ///movielens-20m/ratings.csv \
    -output score_result_2

# Merge the job's output parts into a single local file
hdfs dfs -getmerge score_result_2 results_2.txt

Ejercicio 4

# Local dry run of the map -> sort -> reduce pipeline
python mapper_3.py < movies/ratings.csv | sort -k 1,1 | python reducer_3.py

# Make both scripts executable so Hadoop streaming can launch them
chmod +x mapper_3.py reducer_3.py

# Run the streaming job on the cluster
hadoop jar /usr/lib/hadoop-mapreduce/hadoop-streaming-2.8.5-amzn-5.jar \
    -file mapper_3.py -mapper mapper_3.py \
    -file reducer_3.py -reducer reducer_3.py \
    -input ///movielens-20m/ratings.csv \
    -output ejercicio4-result

# Merge the job's output parts into a single local file
hdfs dfs -getmerge ejercicio4-result results_3.txt

Ejercicio 5

# Local dry run of the map -> sort -> reduce pipeline
python mapper_4.py < movies/movies.csv | sort -k 1,1 | python reducer_4.py

# Make both scripts executable so Hadoop streaming can launch them
chmod +x mapper_4.py reducer_4.py

# Run the streaming job on the cluster
hadoop jar /usr/lib/hadoop-mapreduce/hadoop-streaming-2.8.5-amzn-5.jar \
    -file mapper_4.py -mapper mapper_4.py \
    -file reducer_4.py -reducer reducer_4.py \
    -input ///movielens-20m/movies.csv \
    -output ejercicio5-result

# Merge the job's output parts into a single local file
hdfs dfs -getmerge ejercicio5-result results_4.txt

Ejercicio 6

  • copiar los archivos de la instancia al Mac local
  • se asume que los archivos están en la carpeta ~/desafio5 de la instancia
  • muevan la carpeta movies fuera del directorio donde están los archivos para que no se descargue
# Create the local destination folder
mkdir desafio-5

# Copy every file matched by ~/desafio5/* on the instance into it
scp -i ~/desafio-gd-pem.pem \
    "hadoop@ec2-3-219-170-196.compute-1.amazonaws.com:~/desafio5/*" \
    desafio-5/
@isaiascardenas
Copy link

isaiascardenas commented Jan 8, 2020

mapper_1.py

#!/usr/bin/python3.6
import re, sys
def map_tag_scores(lines):
    """Yield one ``tag_id,score`` record per valid input line.

    Each input line is expected to hold three comma-separated fields:
    movie id, tag id and score.  Lines that do not have exactly three
    fields (e.g. blank lines) or whose score is not numeric (e.g. a CSV
    header row) are skipped, so the reducer never receives a record it
    cannot parse with ``float()``.
    """
    for raw_line in lines:
        fields = [field.strip() for field in raw_line.split(',')]
        if len(fields) != 3:
            continue
        _movie_id, tag_id, score = fields
        try:
            float(score)
        except ValueError:
            # Non-numeric score: not a data row.
            continue
        yield tag_id + "," + score


def main():
    """Read records from stdin and print the mapped records to stdout."""
    for record in map_tag_scores(sys.stdin):
        print(record)


if __name__ == "__main__":
    main()

@isaiascardenas
Copy link

isaiascardenas commented Jan 8, 2020

reducer_1.py

#!/usr/bin/python3.6

import sys

def reduce_tag_averages(lines):
    """Yield ``tag_id<TAB>average`` for each run of equal tag ids.

    ``lines`` must contain ``tag_id,score`` records already sorted so that
    all records of one tag are adjacent.  The average of a run is emitted
    under the tag that the run belongs to (the previous key), not the tag
    of the record that ended the run.  Blank lines are ignored and empty
    input yields nothing.

    NOTE(review): the records carry no tab, so Hadoop streaming presumably
    uses the whole line as the key; with more than one reducer a tag's
    records may be split across reducers -- confirm the job configuration.
    """
    current_tag = None
    score_sum = 0.0
    score_count = 0

    for raw_line in lines:
        record = raw_line.strip()
        if not record:
            continue
        tag_id, score = record.split(',')
        if tag_id != current_tag:
            if current_tag is not None:
                yield current_tag + "\t" + str(score_sum / score_count)
            current_tag = tag_id
            score_sum = 0.0
            score_count = 0
        score_sum += float(score)
        score_count += 1

    # Flush the last run; skipped when no record was read at all.
    if current_tag is not None:
        yield current_tag + "\t" + str(score_sum / score_count)


def main():
    """Read sorted mapper output from stdin and print the averages."""
    for result in reduce_tag_averages(sys.stdin):
        print(result)


if __name__ == "__main__":
    main()

@isaiascardenas
Copy link

Comando gigante

# Same streaming job as mapper_1/reducer_1 above, with results_1.txt as the
# job's output path
hadoop jar /usr/lib/hadoop-mapreduce/hadoop-streaming-2.8.5-amzn-5.jar \
    -file mapper_1.py -mapper mapper_1.py \
    -file reducer_1.py -reducer reducer_1.py \
    -input ///movielens-20m/genome-scores.csv \
    -output results_1.txt

@rodrigore
Copy link
Author

rodrigore commented Jan 9, 2020

mapper_3.py

#!/usr/bin/python3.6

import re, sys

def map_movie_ratings(lines):
    """Yield one ``movie_id,rating`` record per valid input line.

    Each input line is expected to hold four comma-separated fields:
    user id, movie id, rating and timestamp.  Lines that do not have
    exactly four fields (e.g. blank lines) or whose rating is not numeric
    (e.g. a CSV header row) are skipped, so the reducer never receives a
    record it cannot parse with ``float()``.
    """
    for raw_line in lines:
        fields = [field.strip() for field in raw_line.split(',')]
        if len(fields) != 4:
            continue
        _user_id, movie_id, rating, _timestamp = fields
        try:
            float(rating)
        except ValueError:
            # Non-numeric rating: not a data row.
            continue
        yield movie_id + ',' + rating


def main():
    """Read records from stdin and print the mapped records to stdout."""
    for record in map_movie_ratings(sys.stdin):
        print(record)


if __name__ == "__main__":
    main()

reducer_3.py

#!/usr/bin/python3.6
import sys

def reduce_movie_averages(lines):
    """Yield ``movie_id<TAB>average`` for each run of equal movie ids.

    ``lines`` must contain ``movie_id,rating`` records already sorted so
    that all records of one movie are adjacent.  The average of a run is
    emitted under the movie that the run belongs to (the previous key),
    not the movie of the record that ended the run.  Blank lines are
    ignored and empty input yields nothing.

    NOTE(review): the records carry no tab, so Hadoop streaming presumably
    uses the whole line as the key; with more than one reducer a movie's
    records may be split across reducers -- confirm the job configuration.
    """
    current_movie = None
    rating_sum = 0.0
    rating_count = 0

    for raw_line in lines:
        record = raw_line.strip()
        if not record:
            continue
        movie_id, rating = record.split(',')
        if movie_id != current_movie:
            if current_movie is not None:
                yield current_movie + "\t" + str(rating_sum / rating_count)
            current_movie = movie_id
            rating_sum = 0.0
            rating_count = 0
        rating_sum += float(rating)
        rating_count += 1

    # Flush the last run; skipped when no record was read at all.
    if current_movie is not None:
        yield current_movie + "\t" + str(rating_sum / rating_count)


def main():
    """Read sorted mapper output from stdin and print the averages."""
    for result in reduce_movie_averages(sys.stdin):
        print(result)


if __name__ == "__main__":
    main()

@rodrigore
Copy link
Author

rodrigore commented Jan 9, 2020

mapper_2.py

#!/usr/bin/python3.6
	
import re, sys

def map_user_ratings(lines):
    """Yield one ``user_id,rating`` record per valid input line.

    Each input line is expected to hold four comma-separated fields:
    user id, movie id, rating and timestamp.  Lines that do not have
    exactly four fields (e.g. blank lines) or whose rating is not numeric
    (e.g. a CSV header row) are skipped, so the reducer never receives a
    record it cannot parse with ``float()``.
    """
    for raw_line in lines:
        fields = [field.strip() for field in raw_line.split(',')]
        if len(fields) != 4:
            continue
        user_id, _movie_id, rating, _timestamp = fields
        try:
            float(rating)
        except ValueError:
            # Non-numeric rating: not a data row.
            continue
        yield user_id + ',' + rating


def main():
    """Read records from stdin and print the mapped records to stdout."""
    for record in map_user_ratings(sys.stdin):
        print(record)


if __name__ == "__main__":
    main()

reducer_2.py

#!/usr/bin/python3.6
import sys

def reduce_user_averages(lines):
    """Yield ``user_id,average`` for each run of equal user ids.

    ``lines`` must contain ``user_id,rating`` records already sorted so
    that all records of one user are adjacent.  The per-user counter
    starts at zero for every new user, so the average divides by the
    actual number of ratings.  Blank lines are ignored and empty input
    yields nothing.

    NOTE(review): the records carry no tab, so Hadoop streaming presumably
    uses the whole line as the key; with more than one reducer a user's
    records may be split across reducers -- confirm the job configuration.
    """
    current_user = None
    rating_sum = 0.0
    rating_count = 0

    for raw_line in lines:
        record = raw_line.strip()
        if not record:
            continue
        user_id, rating = record.split(',')
        if user_id != current_user:
            if current_user is not None:
                yield current_user + ',' + str(rating_sum / rating_count)
            current_user = user_id
            rating_sum = 0.0
            rating_count = 0
        rating_sum += float(rating)
        rating_count += 1

    # Flush the last run; skipped when no record was read at all.
    if current_user is not None:
        yield current_user + ',' + str(rating_sum / rating_count)


def main():
    """Read sorted mapper output from stdin and print the averages."""
    for result in reduce_user_averages(sys.stdin):
        print(result)


if __name__ == "__main__":
    main()

@JPabloix
Copy link

JPabloix commented Jan 9, 2020

mapper_4.py

#!/usr/bin/python3.6
import re, sys

def map_genre_counts(lines):
    """Yield, for each movie row, the number of genres as a string.

    The genre list is taken from the last comma-separated field, so commas
    inside the other fields do not matter; genres are separated by ``|``.
    Blank lines and rows whose first field is not a number (e.g. a CSV
    header row) are skipped so they are not counted as one-genre movies.

    NOTE(review): a placeholder genre value such as "(no genres listed)"
    still counts as one genre -- confirm that this is intended.
    """
    for raw_line in lines:
        record = raw_line.strip()
        if not record:
            continue
        fields = record.split(',')
        if not fields[0].strip().isdigit():
            # First field is not a numeric id: not a data row.
            continue
        genres = fields[-1].strip().split('|')
        yield str(len(genres))


def main():
    """Read movie rows from stdin and print one genre count per row."""
    for genre_count in map_genre_counts(sys.stdin):
        print(genre_count)


if __name__ == "__main__":
    main()

reducer_4.py

#!/usr/bin/python3.6

import sys

def reduce_count_frequencies(lines):
    """Yield ``value,occurrences`` for each run of equal input lines.

    ``lines`` must already be sorted so that equal values are adjacent.
    Blank lines are ignored and empty input yields nothing (instead of
    failing while formatting a missing key).
    """
    current_value = None
    occurrences = 0

    for raw_line in lines:
        value = raw_line.strip()
        if not value:
            continue
        if value != current_value:
            if current_value is not None:
                yield current_value + ',' + str(occurrences)
            current_value = value
            occurrences = 0
        occurrences += 1

    # Flush the last run; skipped when no record was read at all.
    if current_value is not None:
        yield current_value + ',' + str(occurrences)


def main():
    """Read sorted mapper output from stdin and print the frequencies."""
    for result in reduce_count_frequencies(sys.stdin):
        print(result)


if __name__ == "__main__":
    main()

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment