Skip to content

Instantly share code, notes, and snippets.

@Nandan-N
Last active September 16, 2023 11:03
Show Gist options
  • Save Nandan-N/05009bb04a2b4fd36aa020a197a1218a to your computer and use it in GitHub Desktop.
Save Nandan-N/05009bb04a2b4fd36aa020a197a1218a to your computer and use it in GitHub Desktop.
# Submit the job through Hadoop Streaming, using mapper.py / reducer.py from the
# current directory. $HADOOP_HOME must be set, and the two <...> placeholders
# must be replaced with real HDFS paths before running.
hadoop jar $HADOOP_HOME/share/hadoop/tools/lib/hadoop-streaming-3.3.6.jar \
-mapper "$PWD/mapper.py" \
-reducer "$PWD/reducer.py" \
-input <path_to_input_in_hdfs> \
-output <path_to_output_folder_in_hdfs>
# Local dry run without Hadoop: sorting on the first field puts records with the
# same key next to each other before they reach the reducer.
cat sample_data.json | ./mapper.py | sort -k 1,1 | ./reducer.py
#!/usr/bin/env python
import sys
import json
# Local dry run without Hadoop: sorting on the first field puts records with the
# same key next to each other before they reach the reducer.
cat sample_data.json | ./mapper.py | sort -k 1,1 | ./reducer.py
# Submit the job through Hadoop Streaming, using mapper.py / reducer.py from the
# current directory. $HADOOP_HOME must be set, and the two <...> placeholders
# must be replaced with real HDFS paths before running.
hadoop jar $HADOOP_HOME/share/hadoop/tools/lib/hadoop-streaming-3.3.6.jar \
-mapper "$PWD/mapper.py" \
-reducer "$PWD/reducer.py" \
-input <path_to_input_in_hdfs> \
-output <path_to_output_folder_in_hdfs>
def strike_rate_of(runs, balls):
    """Return runs scored per 100 balls, rounded to 3 decimal places.

    A batsman who faced no balls (``balls <= 0``) gets a strike rate of 0.0
    instead of triggering a division by zero.
    """
    if balls > 0:
        return round((runs / balls) * 100, 3)
    return 0.0


def map_record(line):
    """Turn one JSON input line into a ``name<TAB>strike_rate`` record.

    Args:
        line: One line of input, expected to be a JSON object with the keys
            ``"name"``, ``"runs"`` and ``"balls"``.

    Returns:
        The tab-separated output record, or ``None`` when the line is
        malformed and should be skipped.
    """
    try:
        data = json.loads(line)
        name = data["name"]
        runs = float(data["runs"])
        balls = float(data["balls"])
    except (ValueError, KeyError, TypeError):
        # ValueError: invalid JSON or non-numeric runs/balls.
        # KeyError:   a required field is missing.
        # TypeError:  the line is not a JSON object, or a value is null.
        # One bad record must not abort the whole map task, so skip it.
        return None
    return f"{name}\t{strike_rate_of(runs, balls)}"


def map_lines(lines):
    """Yield the mapper output record for every well-formed input line."""
    for line in lines:
        record = map_record(line)
        if record is not None:
            yield record


def run_mapper():
    """Read JSON records from stdin and print one key/value pair per record."""
    for record in map_lines(sys.stdin):
        print(record)


if __name__ == "__main__":
    run_mapper()
#!/usr/bin/env python
import sys
import json


def format_result(name, rate_sum, rate_count):
    """Build the JSON output line for one batsman.

    Args:
        name: The batsman's name (the reducer key).
        rate_sum: Sum of the local strike rates received for this batsman.
        rate_count: Number of local strike rates that were summed.

    Returns:
        A JSON object string with ``"name"`` and ``"strike_rate"``, where the
        strike rate is the mean of the local rates rounded to 3 decimals
        (0.0 when no rates were received).
    """
    if rate_count > 0:
        aggregate = round(rate_sum / rate_count, 3)
    else:
        aggregate = 0.0
    # json.dumps escapes quotes/backslashes in the name so the line is always
    # valid JSON; ensure_ascii=False keeps non-ASCII names readable.
    return json.dumps({"name": name, "strike_rate": aggregate}, ensure_ascii=False)


def reduce_lines(lines):
    """Aggregate sorted ``name<TAB>strike_rate`` lines into one result per name.

    The input must already be grouped by name (as Hadoop's shuffle or a local
    ``sort`` provides); a result is emitted each time the name changes and
    once more at the end of the input. Lines that do not consist of exactly
    two tab-separated fields, or whose strike rate is not a number, are
    skipped.

    Args:
        lines: Iterable of mapper output lines.

    Yields:
        One JSON string per distinct consecutive name.
    """
    current_name = None
    rate_sum = 0.0
    rate_count = 0

    for line in lines:
        fields = line.strip().split("\t")
        if len(fields) != 2:
            continue
        name, raw_rate = fields
        try:
            rate = float(raw_rate)
        except ValueError:
            continue

        # A different key means the previous batsman's group is complete.
        if current_name is not None and name != current_name:
            yield format_result(current_name, rate_sum, rate_count)
            rate_sum = 0.0
            rate_count = 0

        current_name = name
        # Each local strike rate counts once; the aggregate is their mean.
        # (The previous code scaled the rate by 100 before dividing by 100
        # balls and multiplying by 100 again, inflating the result 100-fold.)
        rate_sum += rate
        rate_count += 1

    # Flush the final group, which has no following key change to trigger it.
    if current_name is not None:
        yield format_result(current_name, rate_sum, rate_count)


def run_reducer():
    """Read mapper output from stdin and print one JSON result per batsman."""
    for result in reduce_lines(sys.stdin):
        print(result)


if __name__ == "__main__":
    run_reducer()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment