Mike Taylor rolyatm

## gist:56c2b1a81df10173c9295d8d0fbd8a7a
aws cognito-idp initiate-auth --client-id <CLIENTID> --auth-flow USER_PASSWORD_AUTH --auth-parameters USERNAME=<USER>,PASSWORD="<PASSWORD>"

The response will contain a Session token; pass it as the --session value in the next command.

aws cognito-idp admin-respond-to-auth-challenge --user-pool-id <USERPOOLID> --client-id <CLIENTID>   --challenge-responses "NEW_PASSWORD=<NEW>,USERNAME=<USER>" --challenge-name NEW_PASSWORD_REQUIRED --session "<SESSION TOKEN>"

## readme
Create a sample Spark application to process the example data and report some sort of interesting results.
Send the source code and your findings.

Example data:
  https://s3.amazonaws.com/ipsos-rad-sample-data/0000_part_00.gz
  https://s3.amazonaws.com/ipsos-rad-sample-data/0001_part_00.gz

Hint:
  The delimiter in the example data is a non-standard character: ASCII 31 (the unit separator).
  In Python I use: sc.textFile('file').map(lambda x: x.split(chr(31)))

## url_regex.py
"""
  url_pattern regex will very generously match URL patterns. It also matches numbers, email addresses and
  a few other funky cases.

  Please update the code below to eliminate the special cases.
"""
# very liberal match of a possible URL pattern
import re

# The pattern is written as raw strings so the regex escapes (\w, \d, \., \?)
# reach the regex engine verbatim instead of relying on Python passing unknown
# string escapes through (which warns on recent Python versions).
#
# Percent-escapes use [a-fA-F\d]. The earlier class "a-fA-f" spanned the ASCII
# range 'A'..'f', so it also accepted G-Z and some punctuation as hex digits.
url_pattern = (
    r'(([\w]+:)?//)?'                                                # optional "scheme://" or bare "//"
    r'(([\d\w]|%[a-fA-F\d]{2,2})+(:([\d\w]|%[a-fA-F\d]{2,2})+)?@)?'  # optional "user[:password]@"
    r'([\d\w][-\d\w]{0,253}[\d\w]?\.)+'                              # one or more dot-terminated host labels
    r'[\w]{2,63}'                                                    # final label (TLD position)
    r'(:[\d]+)?'                                                     # optional ":port"
    r'(/([-+_~.\d\w]|%[a-fA-F\d]{2,2})*)*'                           # zero or more "/segment" parts
    r'(\?(&?([-+_~.\d\w]|%[a-fA-F\d]{2,2})=?)*)?'                    # optional "?query"
    r'(#([-+_~.\d\w]|%[a-fA-F\d]{2,2})*)?'                           # optional "#fragment"
)
# Starts empty; presumably filled with matches by code beyond this excerpt.
matched_url = []

## create_rtree_file_index.py
import json
from rtree import index

# Input file name (GeoJSON, judging by the extension).
FILE = 'admin_v2.geojson'
# Output name -- presumably the base name of the on-disk rtree index;
# confirm against the code that consumes it (not visible in this excerpt).
OUTPUT = 'geotag_admin_v2'
# index settings
# http://libspatialindex.github.io/overview.html#references
# NOTE(review): these look like libspatialindex tuning values (leaf capacity,
# index capacity, fill factor) intended for rtree's index properties -- their
# use is not visible here; confirm.
LEAFCAPACITY = 100
INDEXCAPACITY = 100
FILLFACTOR = 0.7

## emojis.py
# -*- coding: utf-8 -*-
# Scorpion Emojis
# converts emoticons to emoji equivalents
# add synonyms of emojis for better search support

from __future__ import unicode_literals, print_function
import re

class Emojis():
	def __init__(self):

## tokenizer.py
# -*- coding: utf-8 -*-
# Scorpion Tokenizer
# includes Arabic specific stemmer

from __future__ import unicode_literals, print_function
import os
import re
import itertools
from nltk.tokenize import TweetTokenizer
from nltk.stem.porter import PorterStemmer

## nltk_stanford_segmenter.py
#!/usr/bin/env python
# -*- coding: utf-8 -*-
# Natural Language Toolkit: Interface to the Stanford Segmenter
# for Chinese and Arabic
#
# Copyright (C) 2001-2017 NLTK Project
# Author: 52nlp <52nlpcn@gmail.com>
#         Casper Lehmann-Strøm <casperlehmann@gmail.com>
#         Alex Constantin <alex@keyworder.ch>
#
	aws cognito-idp initiate-auth --client-id <CLIENTID> --auth-flow USER_PASSWORD_AUTH --auth-parameters USERNAME=<USER>,PASSWORD="<PASSWORD>"

	Response will contain a Session token.

	aws cognito-idp admin-respond-to-auth-challenge --user-pool-id <USERPOOLID> --client-id <CLIENTID> --challenge-responses "NEW_PASSWORD=<NEW>,USERNAME=<USER>" --challenge-name NEW_PASSWORD_REQUIRED --session "<SESSION TOKEN>"
	Create a sample Spark application to process the example data and report some sort of interesting results.
	Send the source code and your findings.

	Example data:
	https://s3.amazonaws.com/ipsos-rad-sample-data/0000_part_00.gz
	https://s3.amazonaws.com/ipsos-rad-sample-data/0001_part_00.gz

	Hint:
	The delimiter in the example data is a non-standard character.
	In Python I use: sc.textFile('file').map(lambda x: x.split(chr(31)))
	'''
	url_pattern regex will very generously match URL patterns. It also matches numbers, email address and
	a few other funky cases.

	Please update the code below to eliminate the special cases.
	'''
	# very liberal match of a possible URL pattern
	import re
	url_pattern = '(([\w]+:)?//)?(([\d\w]|%[a-fA-f\d]{2,2})+(:([\d\w]|%[a-fA-f\d]{2,2})+)?@)?([\d\w][-\d\w]{0,253}[\d\w]?\.)+[\w]{2,63}(:[\d]+)?(/([-+_~.\d\w]|%[a-fA-f\d]{2,2})*)*(\?(&?([-+_~.\d\w]|%[a-fA-f\d]{2,2})=?)*)?(#([-+_~.\d\w]|%[a-fA-f\d]{2,2})*)?'
	matched_url = []
	import json
	from rtree import index

	FILE = 'admin_v2.geojson'
	OUTPUT = 'geotag_admin_v2'
	# index settings
	# http://libspatialindex.github.io/overview.html#references
	LEAFCAPACITY = 100
	INDEXCAPACITY = 100
	FILLFACTOR = 0.7
	# -*- coding: utf-8 -*-
	# Scorpion Emojis
	# converts emoticons to emoji equivalents
	# add synonyms of emojis for better search support

	from __future__ import unicode_literals, print_function
	import re

	class Emojis():
	def __init__(self):
	# -*- coding: utf-8 -*-
	# Scorpion Tokenizer
	# includes Arabic specific stemmer

	from __future__ import unicode_literals, print_function
	import os
	import re
	import itertools
	from nltk.tokenize import TweetTokenizer
	from nltk.stem.porter import PorterStemmer
	#!/usr/bin/env python
	# -*- coding: utf-8 -*-
	# Natural Language Toolkit: Interface to the Stanford Segmenter
	# for Chinese and Arabic
	#
	# Copyright (C) 2001-2017 NLTK Project
	# Author: 52nlp <52nlpcn@gmail.com>
	# Casper Lehmann-Strøm <casperlehmann@gmail.com>
	# Alex Constantin <alex@keyworder.ch>
	#