Skip to content

Instantly share code, notes, and snippets.

@thuwarakeshm
Last active July 18, 2021 11:02
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save thuwarakeshm/12e6136e3f8890b5425f0b454eeae724 to your computer and use it in GitHub Desktop.
Save thuwarakeshm/12e6136e3f8890b5425f0b454eeae724 to your computer and use it in GitHub Desktop.
Tuplex
# FastETL configuration file
# created 2019-02-17 16:45:09.940033 UTC
#
# Every option is a single-key entry in the list under `tuplex`.
tuplex:
    - allowUndefinedBehavior: false
    - autoUpcast: false
    # The CSV options form their own sub-list under `csv`; they must be
    # indented deeper than `csv` or they become siblings of it.
    - csv:
        - comments: ["#", "~"]
        - generateParser: true
        - maxDetectionMemory: 256KB
        - maxDetectionRows: 100
        - quotechar: "\""
        - selectionPushdown: true
        - separators: [",", ";", "|", "\t"]
    - driverMemory: 1GB
    - executorCount: 4
    - executorMemory: 1GB
    - logDir: .
    - normalcaseThreshold: 0.9
    - partitionSize: 1MB
    - runTimeLibrary: tuplex_runtime
    - runTimeMemory: 32MB
    - runTimeMemoryBlockSize: 4MB
    - scratchDir: /tmp
    - useLLVMOptimizer: true
def count_primes(max_num):
    """Count the prime numbers up to ``max_num`` thousand.

    The argument is expressed in thousands: ``count_primes(40)`` examines
    every integer in ``0 .. 40_000`` (upper bound included).

    Each candidate is tested by plain trial division against every integer
    in ``2 .. num - 1``.

    Args:
        max_num: Upper bound of the search range, in thousands.

    Returns:
        The number of primes ``p`` with ``2 <= p <= max_num * 1000``.
        Zero when the range is empty or contains no primes.
    """
    count = 0
    for num in range(max_num * 1000 + 1):
        # 0 and 1 are not prime.
        if num > 1:
            for i in range(2, num):
                if num % i == 0:
                    break
            else:
                # for/else: reached only when the inner loop finished
                # without hitting ``break``, i.e. no divisor was found.
                count += 1
    return count
from tuplex import *
c = Context()
# Divide the first element of each pair by the second. Rows whose division
# raises ZeroDivisionError are handed to the resolver, which yields 0.
(
    c.parallelize([(1, 0), (2, 1), (3, 0), (4, -1)])
    .map(lambda num, den: num / den)
    .resolve(ZeroDivisionError, lambda _num, _den: 0)
    .collect()
)
from tuplex import *
c = Context()
# Same division as a resolved pipeline would run, but with no resolver
# attached: the pairs with a zero second element raise ZeroDivisionError
# inside the mapped function.
# NOTE(review): how Tuplex treats those rows is not visible here - confirm
# against the Tuplex documentation.
(
    c.parallelize([(1, 0), (2, 1), (3, 0), (4, -1)])
    .map(lambda num, den: num / den)
    .collect()
)
%%time
for val in [10, 20, 30, 40]:
print(count_primes(val))
from multiprocessing import Pool
from datetime import datetime
def count_primes(max_num):
    """Count the prime numbers up to ``max_num`` thousand.

    The argument is expressed in thousands: ``count_primes(40)`` examines
    every integer in ``0 .. 40_000`` (upper bound included).

    Each candidate is tested by plain trial division against every integer
    in ``2 .. num - 1``.

    Args:
        max_num: Upper bound of the search range, in thousands.

    Returns:
        The number of primes ``p`` with ``2 <= p <= max_num * 1000``.
        Zero when the range is empty or contains no primes.
    """
    count = 0
    for num in range(max_num * 1000 + 1):
        # 0 and 1 are not prime.
        if num > 1:
            for i in range(2, num):
                if num % i == 0:
                    break
            else:
                # for/else: reached only when the inner loop finished
                # without hitting ``break``, i.e. no divisor was found.
                count += 1
    return count
# The guard keeps worker processes that re-import this module from
# re-running the benchmark themselves.
if __name__ == "__main__":
    start_time = datetime.now()
    # Five worker processes share the four inputs; ``map`` blocks until all
    # results are back and returns them in input order.
    with Pool(5) as p:
        print(p.map(count_primes, [10, 20, 30, 40]))
    # Taken after the ``with`` block, so the elapsed time also covers
    # shutting the pool down.
    end_time = datetime.now()
    print(f"It took {end_time - start_time} to run")
from tuplex import *
# Build a context, passing one configuration option as a keyword argument.
c = Context(
    executorMemory="2G",
)
from tuplex import *
# Build a context from the configuration file at the given path.
c = Context(
    conf="/conf/tuplex.yaml",
)
from tuplex import *
c = Context()
# A single-parameter function receives the whole row, so its fields are
# read by index. Summing both fields of each pair makes collect() return
# [11, 22, 33].
(
    c.parallelize([(1, 10), (2, 20), (3, 30)])
    .map(lambda row: row[0] + row[1])
    .collect()
)
from tuplex import *
c = Context()
# Apply the named function count_primes to each of the four inputs and
# gather the results.
(
    c.parallelize([10, 20, 30, 40])
    .map(count_primes)
    .collect()
)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment