import requests
from bs4 import BeautifulSoup

def make_soup(url):
    # Fetch the page and parse it into a BeautifulSoup tree
    response = requests.get(url)
    return BeautifulSoup(response.text, 'html.parser')

def initial_links(website):
    # Collect the href of every anchor on the page
    soup = make_soup(website)
    return [a.get('href') for a in soup.find_all('a', href=True)]
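
A quick smoke test, assuming a reachable URL (the address below is only a placeholder):

# Hypothetical run: print the first few links found on a page
links = initial_links('https://example.com')
print(links[:10])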
import os

# Verify the downloaded Hive tarball exists and report its size in bytes
dest = 'apache-hive-0.14.0-bin.tar.gz'
print(os.stat(dest).st_size)
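
If the size looks right, a hypothetical next step is unpacking the archive with the standard library (the target directory is an assumption):

import tarfile

# Extract the Hive tarball; '/opt' is a placeholder install location
with tarfile.open(dest, 'r:gz') as tar:
    tar.extractall('/opt')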
# Clear the old HDFS directories before re-importing (-rmr is deprecated; -rm -r is the current form)
hadoop fs -rm -r ./AAA_AnnualReview
hadoop fs -rm -r ./AAA_Employee
hadoop fs -rm -r ./tbl_Underwriting_Extract

# Pull the AAA_AnnualReview table from SQL Server into HDFS with a single mapper
sqoop import \
  --driver com.microsoft.sqlserver.jdbc.SQLServerDriver \
  --connect 'jdbc:sqlserver://sa3sql500:51433;database=BigData' \
  --table AAA_AnnualReview \
  --username BigData_ETL_User \
  --password BigData_ETL_User \
  -m 1
import math
import string
from nltk.corpus import stopwords
from pyspark.sql.functions import udf
from pyspark.sql.types import IntegerType

def question_headline(headline):
    # Return 1 if the headline reads like a question, 0 otherwise
    keywords = ['?', 'should', 'can', 'if', 'is', 'would', 'why', 'how', 'when', 'where']
    if any(word in headline.lower() for word in keywords):
        return 1
    return 0
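
To apply this inside Spark, a minimal sketch, assuming a DataFrame df with a 'headline' column (both names are assumptions):

# Wrap the function as a UDF and add a flag column
question_udf = udf(question_headline, IntegerType())
df = df.withColumn('is_question', question_udf(df['headline']))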
from faker import Faker

fake = Faker()
outfile = 'data.csv'
outsize = 1024 * 1024 * 1024  # 1 GB target file size

with open(outfile, 'w') as csvfile:
    size = 0
    while size < outsize:
        # Faker addresses contain newlines, so flatten them for CSV
        txt = '%s, %s\n' % (fake.name(), fake.address().replace('\n', ', '))
        csvfile.write(txt)
        size += len(txt)
from faker import Faker
import datetime
import random
import sys
import csv

# Timed variant of the generator above
startTime = datetime.datetime.now()
fake = Faker()
outfile = 'data_test.csv'
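
The fragment stops before the write loop; a hypothetical completion under the same name/address schema as the snippet above (the row count is an assumption):

# Stream fake rows through the csv module, then report elapsed time
with open(outfile, 'w', newline='') as f:
    writer = csv.writer(f)
    writer.writerow(['name', 'address'])
    for _ in range(1000000):
        writer.writerow([fake.name(), fake.address().replace('\n', ', ')])
print(datetime.datetime.now() - startTime)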
<?xml version="1.0" encoding="UTF-8" standalone="no"?>
<?xml-stylesheet type="text/xsl" href="configuration.xsl"?><!--
Licensed to the Apache Software Foundation (ASF) under one or more
contributor license agreements. See the NOTICE file distributed with
this work for additional information regarding copyright ownership.
The ASF licenses this file to You under the Apache License, Version 2.0
(the "License"); you may not use this file except in compliance with
the License. You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
-->
<configuration>
  <!-- Site-specific property overrides go here -->
</configuration>
import os
import sys

# Point Python at a local Spark install and its bundled Py4J
spark_home = '/usr/local/spark'
sys.path.insert(0, spark_home + '/python')
sys.path.insert(0, os.path.join(spark_home, 'python/lib/py4j-0.10.1-src.zip'))

# Ship the GCS connector jar so Spark can read gs:// paths
os.environ['PYSPARK_SUBMIT_ARGS'] = '--jars gcs-connector-latest-hadoop2.jar pyspark-shell'

from pyspark import SparkContext, SparkConf
from pyspark.sql import SparkSession
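
From here, a minimal sketch of starting a session and reading from Cloud Storage (the bucket path is a placeholder, and this assumes the GCS connector is configured with valid credentials):

# Build a session and read a CSV straight from a gs:// URI
spark = SparkSession.builder.appName('gcs-example').getOrCreate()
df = spark.read.csv('gs://my-bucket/data.csv', header=True)
df.show(5)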