import requests
from bs4 import BeautifulSoup

def make_soup(url):
    # Fetch the page and parse it into a BeautifulSoup tree
    response = requests.get(url)
    return BeautifulSoup(response.text, 'html.parser')

def initial_links(website):
    # Collect the href of every anchor on the page
    soup = make_soup(website)
    return [a.get('href') for a in soup.find_all('a', href=True)]
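
A quick smoke test, assuming a reachable URL (the address below is only a placeholder):

# Hypothetical run: print the first few links found on a page
links = initial_links('https://example.com')
print(links[:10])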
import os

# Verify the downloaded Hive tarball exists and report its size in bytes
dest = 'apache-hive-0.14.0-bin.tar.gz'
print(os.stat(dest).st_size)
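
If the size looks right, a hypothetical next step is unpacking the archive with the standard library (the target directory is an assumption):

import tarfile

# Extract the Hive tarball; '/opt' is a placeholder install location
with tarfile.open(dest, 'r:gz') as tar:
    tar.extractall('/opt')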
# Clear the old HDFS directories before re-importing (-rmr is deprecated; -rm -r is the current form)
hadoop fs -rm -r ./AAA_AnnualReview
hadoop fs -rm -r ./AAA_Employee
hadoop fs -rm -r ./tbl_Underwriting_Extract

# Pull the AAA_AnnualReview table from SQL Server into HDFS with a single mapper
sqoop import \
  --driver com.microsoft.sqlserver.jdbc.SQLServerDriver \
  --connect 'jdbc:sqlserver://sa3sql500:51433;database=BigData' \
  --table AAA_AnnualReview \
  --username BigData_ETL_User \
  --password BigData_ETL_User \
  -m 1
import math
import string
from nltk.corpus import stopwords
from pyspark.sql.functions import udf
from pyspark.sql.types import IntegerType

def question_headline(headline):
    # Return 1 if the headline reads like a question, 0 otherwise
    keywords = ['?', 'should', 'can', 'if', 'is', 'would', 'why', 'how', 'when', 'where']
    if any(word in headline.lower() for word in keywords):
        return 1
    return 0
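
To apply this inside Spark, a minimal sketch, assuming a DataFrame df with a 'headline' column (both names are assumptions):

# Wrap the function as a UDF and add a flag column
question_udf = udf(question_headline, IntegerType())
df = df.withColumn('is_question', question_udf(df['headline']))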
from faker import Faker

fake = Faker()
outfile = 'data.csv'
outsize = 1024 * 1024 * 1024  # 1 GB target file size

with open(outfile, 'w') as csvfile:
    size = 0
    while size < outsize:
        # Faker addresses contain newlines, so flatten them for CSV
        txt = '%s, %s\n' % (fake.name(), fake.address().replace('\n', ', '))
        csvfile.write(txt)
        size += len(txt)
from faker import Faker
import datetime
import random
import sys
import csv

# Timed variant of the generator above
startTime = datetime.datetime.now()
fake = Faker()
outfile = 'data_test.csv'
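
The fragment stops before the write loop; a hypothetical completion under the same name/address schema as the snippet above (the row count is an assumption):

# Stream fake rows through the csv module, then report elapsed time
with open(outfile, 'w', newline='') as f:
    writer = csv.writer(f)
    writer.writerow(['name', 'address'])
    for _ in range(1000000):
        writer.writerow([fake.name(), fake.address().replace('\n', ', ')])
print(datetime.datetime.now() - startTime)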
<?xml version="1.0" encoding="UTF-8" standalone="no"?>
<?xml-stylesheet type="text/xsl" href="configuration.xsl"?><!--
Licensed to the Apache Software Foundation (ASF) under one or more
contributor license agreements. See the NOTICE file distributed with
this work for additional information regarding copyright ownership.
The ASF licenses this file to You under the Apache License, Version 2.0
(the "License"); you may not use this file except in compliance with
the License. You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
-->
<configuration>
  <!-- Site-specific property overrides go here -->
</configuration>
import os
import sys

# Point Python at a local Spark install and its bundled Py4J
spark_home = '/usr/local/spark'
sys.path.insert(0, spark_home + '/python')
sys.path.insert(0, os.path.join(spark_home, 'python/lib/py4j-0.10.1-src.zip'))

# Ship the GCS connector jar so Spark can read gs:// paths
os.environ['PYSPARK_SUBMIT_ARGS'] = '--jars gcs-connector-latest-hadoop2.jar pyspark-shell'

from pyspark import SparkContext, SparkConf
from pyspark.sql import SparkSession
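
From here, a minimal sketch of starting a session and reading from Cloud Storage (the bucket path is a placeholder, and this assumes the GCS connector is configured with valid credentials):

# Build a session and read a CSV straight from a gs:// URI
spark = SparkSession.builder.appName('gcs-example').getOrCreate()
df = spark.read.csv('gs://my-bucket/data.csv', header=True)
df.show(5)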