This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import requests | |
from bs4 import BeautifulSoup | |
def make_soup(url):
    """Fetch *url* over HTTP and return a parsed BeautifulSoup document.

    Bug fix: the original body referenced the undefined name ``website``
    instead of the ``url`` parameter, raising NameError on every call.
    """
    response = requests.get(url)
    # Name a parser explicitly: bs4 warns (and may pick different parsers
    # on different machines) when none is given.
    return BeautifulSoup(response.text, "html.parser")
def initial_links(website): | |
soup = make_soup(website) |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import os

# Local copy of the Hive distribution tarball whose size we report.
dest = 'apache-hive-0.14.0-bin.tar.gz'

# print() with a single argument is valid on both Python 2 and 3;
# the original `print x` statement form is a SyntaxError on Python 3.
print(os.stat(dest).st_size)
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Drop any previous copies of the target tables from HDFS so the sqoop
# import below can recreate them cleanly.
# `hadoop fs -rm -r` replaces the long-deprecated `-rmr` switch.
hadoop fs -rm -r ./AAA_AnnualReview
hadoop fs -rm -r ./AAA_Employee
hadoop fs -rm -r ./tbl_Underwriting_Extract

# Import AAA_AnnualReview from SQL Server into HDFS with one mapper (-m 1).
# SECURITY NOTE(review): the password is passed in clear text on the command
# line (visible in `ps` output and shell history) — prefer --password-file
# or the interactive -P prompt.
sqoop import --driver com.microsoft.sqlserver.jdbc.SQLServerDriver --connect 'jdbc:sqlserver://sa3sql500:51433;database=BigData' --table AAA_AnnualReview --username BigData_ETL_User --password BigData_ETL_User -m 1
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import math | |
import string | |
from nltk.corpus import stopwords | |
from pyspark.sql.functions import udf | |
from pyspark.sql.types import * | |
def question_headline(headline):
    """Return 1 if *headline* looks like a question, else 0.

    A headline counts as a question when it contains a '?' or when any
    whole word (case-insensitive, punctuation stripped) is a question
    keyword.

    Bug fixes vs. the original:
    - substring matching (`word in headline`) matched keywords inside
      other words — 'is' matched "This" and "crisis", flagging nearly
      every headline;
    - the missing else branch returned None instead of 0, so the result
      was not a clean 0/1 indicator.
    """
    keywords = {'should', 'can', 'if', 'is', 'would', 'why', 'how', 'when', 'where'}
    if '?' in headline:
        return 1
    # Compare whole, case-normalised words rather than raw substrings.
    for word in headline.lower().split():
        if word.strip(string.punctuation) in keywords:
            return 1
    return 0
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
from faker import Faker

# Generate a CSV of fake "name, address" rows until the output file
# reaches roughly 1 GiB.  NOTE(review): the write/size-update lines of
# the loop are truncated in this excerpt.
fake = Faker()
outfile = 'data.csv'
outsize = 1024 * 1024 * 1024  # 1GB
with open(outfile, 'w') as csvfile:
    size = 0
    # Keep appending rows until the byte budget is spent.
    while size < outsize:
        # One "name, address" row.  NOTE(review): Faker addresses can
        # contain commas and newlines, so this is not strictly valid CSV.
        txt = '%s, %s\n' % (fake.name(), fake.address())
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
from faker import Faker
import datetime
import random
import sys
import csv

# Seed a Faker generator and name the CSV file this script writes.
# NOTE(review): the generation loop itself is truncated in this excerpt.
fake = Faker()
outfile = 'data_test.csv'
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
from faker import Faker
import datetime
import random
import sys
import csv

# Record the wall-clock start so the script can report elapsed time later.
# NOTE(review): the rest of the script is truncated in this excerpt.
startTime = datetime.datetime.now()
fake = Faker()
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
<?xml version="1.0" encoding="UTF-8" standalone="no"?> | |
<?xml-stylesheet type="text/xsl" href="configuration.xsl"?><!-- | |
Licensed to the Apache Software Foundation (ASF) under one or more | |
contributor license agreements. See the NOTICE file distributed with | |
this work for additional information regarding copyright ownership. | |
The ASF licenses this file to You under the Apache License, Version 2.0 | |
(the "License"); you may not use this file except in compliance with | |
the License. You may obtain a copy of the License at | |
http://www.apache.org/licenses/LICENSE-2.0 |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import os
import sys

# Make a local (non-pip) Spark installation importable: prepend Spark's
# python/ directory and its bundled py4j zip to sys.path.  The py4j zip
# name is version-pinned — update it if the Spark install changes.
spark_home = '/usr/local/spark'
sys.path.insert(0, spark_home + "/python")
sys.path.insert(0, os.path.join(spark_home, 'python/lib/py4j-0.10.1-src.zip'))

# Ship the GCS connector jar with the pyspark-shell launch so Spark can
# read gs:// paths.  Must be set BEFORE pyspark is imported below, since
# pyspark reads PYSPARK_SUBMIT_ARGS at import time.
os.environ['PYSPARK_SUBMIT_ARGS'] = """--jars gcs-connector-latest-hadoop2.jar pyspark-shell"""

from pyspark import SparkContext, SparkConf
from pyspark.sql import SparkSession
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
OlderNewer