
@jamiekt
jamiekt / CreateAzureBLOBusingPowershell
Created September 12, 2014 14:39
Create a BLOB in Azure BLOB Storage using the REST API and Powershell
$method = "PUT"
$storageApiVersion = '2014-02-14'   # REST API version identifier, not a date
$headers = @{"x-ms-version"="$storageApiVersion"}
$StorageAccountName = "<your account name>"
$StorageContainerName = "etl"
$StorageAccountKey = "<your account key>"
$Url = "https://$StorageAccountName.blob.core.windows.net/$StorageContainerName/stub.txt"
$body = "Hello world"
$xmsdate = (Get-Date -Format r).ToString()
$headers.Add("x-ms-date",$xmsdate)
$headers.Add("x-ms-blob-type","BlockBlob")   # required by the Put Blob operation
# NOTE: the request must also be authenticated, either with an Authorization
# header signed using $StorageAccountKey (SharedKey) or a SAS token on $Url.
Invoke-RestMethod -Uri $Url -Method $method -Headers $headers -Body $body
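The same Put Blob request can be sketched in Python with only the standard library. This is a minimal sketch, not the gist's method: the SAS token is an assumption standing in for the SharedKey signature the PowerShell version would build from the account key, and the placeholder values mirror the gist.

```python
import urllib.request

# Placeholders mirror the gist; the SAS token is an assumption -- the gist's
# $StorageAccountKey would instead be used to sign an Authorization header.
account = "<your account name>"
container = "etl"
sas_token = "<your sas token>"

url = f"https://{account}.blob.core.windows.net/{container}/stub.txt?{sas_token}"
req = urllib.request.Request(
    url,
    data=b"Hello world",
    method="PUT",
    headers={
        "x-ms-version": "2014-02-14",
        "x-ms-blob-type": "BlockBlob",  # required by the Put Blob operation
    },
)
# urllib.request.urlopen(req)  # uncomment once real account/SAS values are set
print(req.get_method(), req.get_full_url())
```

The request is only prepared here, not sent; sending it needs a real account name and a valid token.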
<?xml version="1.0" encoding="utf-8"?>
<Project ToolsVersion="4.0" DefaultTargets="Build" xmlns="http://schemas.microsoft.com/developer/msbuild/2003">
  <PropertyGroup>
    <Configuration Condition=" '$(Configuration)' == '' ">Debug</Configuration>
    <SchemaVersion>2.0</SchemaVersion>
    <ProjectGuid>6CAFC0C6-A428-4d30-A9F9-700E829FEA51</ProjectGuid>
    <OutputType>Exe</OutputType>
    <RootNamespace>MyApplication</RootNamespace>
    <AssemblyName>MyApplication</AssemblyName>
    <Name>POSHProfile</Name>
@jamiekt
jamiekt / Parallel-vs-Serial.ps1
Last active August 16, 2019 11:09
Parallel versus Serial ForEach Loop using Powershell Workflow
<#Simple comparison of a serial foreach loop versus a parallel foreach loop
When run on my workstation with $NumberOfIterations=50 this was my output:
elapsed time (serial foreach loop): 20.2380236
elapsed time (parallel foreach loop): 9.7779777
Simply copy and paste into Powershell ISE and hit F5 (needs Powershell v3 or above)
Jamie Thomson, 2014-12-09
#>
workflow workflow1 {
    Param($NumberOfIterations)
"======================================================="
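The same serial-versus-parallel comparison can be sketched outside PowerShell Workflow. A minimal Python analogue using a thread pool, with `time.sleep` standing in for each iteration's work (all names here are illustrative, not from the gist):

```python
import time
from concurrent.futures import ThreadPoolExecutor

def work(_):
    time.sleep(0.05)  # stand-in for real per-iteration work

n = 8

# serial foreach loop
start = time.perf_counter()
for i in range(n):
    work(i)
serial = time.perf_counter() - start

# parallel foreach loop: all iterations run concurrently on a thread pool
start = time.perf_counter()
with ThreadPoolExecutor(max_workers=n) as pool:
    list(pool.map(work, range(n)))
parallel = time.perf_counter() - start

print(f"elapsed time (serial loop):   {serial:.2f}")
print(f"elapsed time (parallel loop): {parallel:.2f}")
```

As in the gist's output, the parallel variant finishes in roughly the time of one iteration rather than the sum of all of them, because the work is sleep-bound rather than CPU-bound.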
@jamiekt
jamiekt / BlogPosts
Created March 25, 2015 19:08
M code to obtain SQLBlog stats
let
Source = Web.Contents("http://sqlblog.com/blogs/Opml.aspx", [Headers=[#"Content-Type"="application/xml"]]),
#"Imported XML" = Xml.Tables(Source),
body = #"Imported XML"{0}[body],
outline = body{0}[outline],
#"Changed Type" = Table.TransformColumnTypes(outline,{{"Attribute:text", type text}}),
outline1 = #"Changed Type"{0}[outline],
#"Changed Type1" = Table.TransformColumnTypes(outline1,{{"Attribute:type", type text}, {"Attribute:text", type text}, {"Attribute:title", type text}, {"Attribute:description", type text}, {"Attribute:xmlUrl", type text}, {"Attribute:htmlUrl", type text}}),
#"Removed Columns" = Table.RemoveColumns(#"Changed Type1",{"Attribute:type"}),
#"Renamed Columns" = Table.RenameColumns(#"Removed Columns",{{"Attribute:text", "blogText"}, {"Attribute:title", "blogTitle"}, {"Attribute:description", "blogDescription"}, {"Attribute:xmlUrl", "blogRssUrl"}, {"Attribute:htmlUrl", "blogUrl"}}),
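The same OPML extraction can be sketched in Python with the standard library. This sketch parses a small inline OPML sample in place of the live sqlblog.com feed, and drills into body → outline → outline just as the M steps do (the sample data is invented for illustration):

```python
import xml.etree.ElementTree as ET

# Inline stand-in for the OPML returned by http://sqlblog.com/blogs/Opml.aspx
opml = """<?xml version="1.0" encoding="utf-8"?>
<opml version="1.0">
  <body>
    <outline text="SQLBlog">
      <outline type="rss" text="Example Blog" title="Example Blog"
               description="demo" xmlUrl="http://example.com/rss"
               htmlUrl="http://example.com"/>
    </outline>
  </body>
</opml>"""

root = ET.fromstring(opml)
# body{0}[outline]{0}[outline] in M becomes a nested-outline path here;
# the renamed columns match the M query's final column names
blogs = [
    {
        "blogText": o.get("text"),
        "blogTitle": o.get("title"),
        "blogDescription": o.get("description"),
        "blogRssUrl": o.get("xmlUrl"),
        "blogUrl": o.get("htmlUrl"),
    }
    for o in root.findall("./body/outline/outline")
]
print(blogs)
```

Swapping the inline string for a real HTTP fetch would reproduce the M query end to end.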
@jamiekt
jamiekt / Iterate over a collection using Redwood Script
Last active January 28, 2021 15:55
Iterate over a collection using Redwood Script
{
Partition partition = jcsJob.getJobDefinition().getPartition();
JobDefinition jd = jcsSession.getJobDefinitionByName(partition, "JD_EchoInParameterValue_jamie_test");
String[] sArr = inParam.split("\n",-1);
int sArrLength = sArr.length;
jcsOut.println("sArrLength=" + String.valueOf(sArrLength));
int iterator = 0;
while (iterator < sArrLength){
jcsOut.println("sArr[" + iterator + "]=" + sArr[iterator]);
GLOB sdist-make: C:\git\github\datapackages\datapackage-py\setup.py
py27 inst-nodeps: C:\git\github\datapackages\datapackage-py\.tox\dist\datapackage-0.0.1.zip
py27 installed: chardet==2.3.0,click==6.3,colorama==0.3.7,coverage==4.0.3,coveralls==1.1,datapackage==0.0.1,docopt==0.6.2,et-xmlfile==1.0.1,funcsigs==0.4,functools32==3.2.3.post2,future==0.15.2,httpretty==0.8.10,ijson==2.3,jdcal==1.2,jsonschema==2.5.1,jsontableschema==0.5.1,mock==1.3.0,openpyxl==2.3.3,pbr==1.8.1,py==1.4.31,pytest==2.9.0,pytest-cov==2.2.1,python-dateutil==2.5.0,requests==2.9.1,rfc3987==1.3.5,six==1.10.0,tabulator==0.3.5,xlrd==0.9.4
py27 runtests: PYTHONHASHSEED='640'
py27 runtests: commands[0] | py.test --cov datapackage --cov-report term-missing --cov-config .coveragerc
============================= test session starts =============================
platform win32 -- Python 2.7.10, pytest-2.9.0, py-1.4.31, pluggy-0.3.1
rootdir: C:\git\github\datapackages\datapackage-py, inifile: pytest.ini
plugins: cov-2.2.1
collected 189 items
GLOB sdist-make: C:\git\github\datapackages\datapackage-py\setup.py
py27 inst-nodeps: C:\git\github\datapackages\datapackage-py\.tox\dist\datapackage-0.0.1.zip
py27 installed: chardet==2.3.0,click==6.3,colorama==0.3.7,coverage==4.0.3,coveralls==1.1,datapackage==0.0.1,docopt==0.6.2,et-xmlfile==1.0.1,funcsigs==0.4,functools32==3.2.3.post2,future==0.15.2,httpretty==0.8.10,ijson==2.3,jdcal==1.2,jsonschema==2.5.1,jsontableschema==0.5.1,mock==1.3.0,openpyxl==2.3.3,pbr==1.8.1,py==1.4.31,pytest==2.9.0,pytest-cov==2.2.1,python-dateutil==2.5.0,requests==2.9.1,rfc3987==1.3.5,six==1.10.0,tabulator==0.3.6,xlrd==0.9.4
py27 runtests: PYTHONHASHSEED='761'
py27 runtests: commands[0] | py.test --cov datapackage --cov-report term-missing --cov-config .coveragerc
============================= test session starts =============================
platform win32 -- Python 2.7.10, pytest-2.9.0, py-1.4.31, pluggy-0.3.1
rootdir: C:\git\github\datapackages\datapackage-py, inifile: pytest.ini
plugins: cov-2.2.1
collected 189 items
@jamiekt
jamiekt / write_file_and_spark_submit.sh
Last active January 5, 2017 15:08
PySpark starter for ten
echo "from pyspark import SparkContext, HiveContext, SparkConf" > sparking.py
echo "conf = SparkConf().setAppName('sparking')" >> sparking.py
echo 'conf.set("spark.sql.parquet.binaryAsString", "true")' >> sparking.py
echo "sc = SparkContext(conf=conf)" >> sparking.py
echo "sqlContext = HiveContext(sc)" >> sparking.py
echo "l = [('Alice', 1)]" >> sparking.py
echo "rdd = sc.parallelize(l)" >> sparking.py
echo "for x in rdd.take(10):" >> sparking.py
echo " print x" >> sparking.py
spark-submit --master yarn --deploy-mode cluster --supervise --name "sparking" sparking.py
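The repeated `echo` appends can be replaced by writing the starter file in a single step. A sketch in Python (the file name and contents mirror the gist; `spark-submit` would still be invoked separately afterwards):

```python
import textwrap

# Contents of the gist's generated sparking.py, written in one shot instead of
# line-by-line echo appends (the script itself targets Python 2 / old PySpark)
starter = textwrap.dedent("""\
    from pyspark import SparkContext, HiveContext, SparkConf
    conf = SparkConf().setAppName('sparking')
    conf.set("spark.sql.parquet.binaryAsString", "true")
    sc = SparkContext(conf=conf)
    sqlContext = HiveContext(sc)
    l = [('Alice', 1)]
    rdd = sc.parallelize(l)
    for x in rdd.take(10):
        print x
    """)

with open("sparking.py", "w") as f:
    f.write(starter)
```

A shell heredoc (`cat > sparking.py <<'EOF' … EOF`) would achieve the same thing without leaving the original script.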
# Old version was getting complicated. This will do.
# Get latest from https://gist.github.com/jamiekt/137d952e2d78bd74f4534cfc63fc7885
ipmo posh-git
Start-SshAgent
$env:path += ";" + (Get-Item "Env:\ProgramFiles").value + "\Git\bin"
$PSDefaultParameterValues["Out-Default:OutVariable"] = "___"
@jamiekt
jamiekt / .gitignore
Last active May 24, 2018 17:48
Scala Spark demo of joining multiple dataframes on same columns using implicit classes. git clone then run using `sbt run`
project
target
metastore_db
derby.log