Skip to content

Instantly share code, notes, and snippets.

View amacal's full-sized avatar

Adrian Macal amacal

View GitHub Profile
# S3 bucket for the data; its name embeds the value of var.account_id.
resource "aws_s3_bucket" "data" {
  bucket        = "wikipedia-${var.account_id}"

  # Per the AWS provider documentation, this lets Terraform destroy the
  # bucket even when it still contains objects.
  force_destroy = true
}
resource "aws_iam_role" "role" {
name = "glue-role"
assume_role_policy = <<EOF
{
"Version": "2012-10-17",
from boto3 import client
from ftplib import FTP
class Parameters:
    """Look up values through the AWS Systems Manager (SSM) client."""

    def __init__(self):
        # A single boto3 SSM client is created here and reused by value().
        self.ssmClient = client('ssm')

    def value(self, name):
        """Return the value stored under the parameter called *name*.

        Args:
            name: Parameter name, forwarded to ``get_parameter`` as ``Name``.

        Returns:
            The ``['Parameter']['Value']`` entry of the service response.
        """
        response = self.ssmClient.get_parameter(Name=name)
        return response['Parameter']['Value']
from boto3 import client
from ftplib import FTP
from gzip import GzipFile
from os import path
class Parameters:
def __init__(self):
self.ssmClient = client('ssm')
<logitem>
<id>49914242</id>
<timestamp>2013-07-15T14:10:45Z</timestamp>
<contributor>
<username>DragonflySixtyseven</username>
<id>62058</id>
</contributor>
<comment>a Wikipedia account is to be used by only one person, and is not to be used to represent an organization as a whole. You are all welcome to create individual accounts.</comment>
<type>block</type>
<action>block</action>
from boto3 import client
from ftplib import FTP
from gzip import GzipFile
from os import path
class Parameters:
def __init__(self):
self.ssmClient = client('ssm')
# Glue Data Catalog database registered under the name "wikipedia".
resource "aws_glue_catalog_database" "wikipedia_database" {
  name = "wikipedia"
}
resource "aws_glue_classifier" "logitem_xml" {
name = "wikipedia-logitem-xml"
xml_classifier {
classification = "xml"
row_tag = "logitem"
filename = 'enwiki-20201020-pages-logging.xml.gz'
directory = 'mirror/wikimedia.org/dumps/enwiki/20201020/'
pipeline = Pipeline(name=filename, steps=[
FtpDownload(host='ftp.acc.umu.se', directory=directory),
MD5Hash('source:md5'),
SHA1Hash('source:sha1'),
Ungzip(),
MD5Hash('destination:md5'),
SHA1Hash('destination:sha1'),
class Component:
# connects infrastructure before first call
def bind(self, prev, next, metrics, metadata):
pass
# called to flush its own data before closing pipeline
def flush(self):
pass
# starts the pipeline, called only in first component
class Pipeline:
def __init__(self, name, steps):
self.steps = steps
self.metadata = Metadata()
self.metrics = Metrics(name)
def init(self):
prev = None
for step in self.steps:
next = BinaryPipe()
from boto3 import client
from ftplib import FTP
from gzip import GzipFile
from os import path
from time import time as now
from hashlib import md5, sha1
class Parameters:
def __init__(self):