
Daniel Rodriguez (danielfrg)

@danielfrg
danielfrg / jython-pig.job
Last active December 26, 2015 22:08
Example of how to run a Jython UDF in AWS EMR. The example loads a list of URLs, queries each URL, and saves the output. Pig version: 0.11.
REGISTER utils.py USING jython AS utils;
urls = LOAD 'INPUT_FILE' USING PigStorage('\t') AS (url:chararray);
query = FOREACH urls GENERATE utils.query(url) AS everything;
file = FOREACH query GENERATE FLATTEN(everything);
STORE file INTO 's3n://OUTPUT_DIR' USING PigStorage('\t');
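The script registers utils.py as a Jython UDF and calls utils.query(url), flattening whatever it returns. The original utils.py is not shown here; a minimal sketch of what such a UDF could look like, assuming it fetches each URL with urllib2 and returns a tuple so that FLATTEN applies:
# utils.py -- hypothetical sketch, not the original gist's implementation
import urllib2

# Pig's Jython script engine injects the outputSchema decorator for UDFs
@outputSchema("t:(url:chararray, response:chararray)")
def query(url):
    try:
        return (url, urllib2.urlopen(url, timeout=10).read())
    except Exception:
        return (url, None)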
@danielfrg
danielfrg / nutch-to-tdf.py
Created November 6, 2013 16:27
Parse Nutch segments into a TDF. Light on memory: only one line is loaded at a time and one HTML document is stored at a time; on the other hand, it is more I/O intensive. _input is the dumped HTML content from Nutch; _output is the TDF that is generated. Requirements: pandas.
import pandas as pd
_input = 'dump0'
_output = 'html0.tdf'
df = pd.DataFrame({'url': [], 'html': []})
df.to_csv(_output, sep='\t', index=None)
def append_tdf(urls, html):
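The preview cuts off at append_tdf. A plausible completion, consistent with the description above (rows are appended as they are parsed so only one HTML document is held in memory); the exact escaping the original applied is unknown:
def append_tdf(urls, html):
    # Hypothetical completion: append the given (url, html) rows to the TDF
    # in append mode, skipping the header that was already written above.
    rows = pd.DataFrame({'url': urls, 'html': html})
    rows.to_csv(_output, sep='\t', index=False, header=False, mode='a')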
@danielfrg
danielfrg / merge-files-hdfs-count-pipeline.py
Last active October 15, 2018 16:18
Luigi pipeline: 1. Reads a bunch of TDF files from local storage and creates a big JSON file in HDFS. 2. Uses a Hadoop MR job to count the number of words (this is actually a field on each JSON object).
import json
import luigi
import luigi.hdfs
import luigi.hadoop
import pandas as pd
import numpy

# Ship numpy and pandas to the Hadoop worker nodes along with the job
luigi.hadoop.attach(numpy, pd)
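The second step is described as a Hadoop MR job that counts words, where the words are a field on each JSON object. Building on the imports above, a minimal sketch of how such a job could be written with luigi.hadoop; the task name, field name, and output path are hypothetical, not the original gist's:
class CountWords(luigi.hadoop.JobTask):
    """Hypothetical MR job: count the 'words' field across the JSON lines in HDFS."""

    def requires(self):
        return MergeFilesToHdfs()  # hypothetical upstream task that writes the big JSON file

    def output(self):
        return luigi.hdfs.HdfsTarget('/data/word_counts.tsv')  # hypothetical output path

    def mapper(self, line):
        record = json.loads(line)
        for word in record.get('words', []):
            yield word, 1

    def reducer(self, key, values):
        yield key, sum(values)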
@danielfrg
danielfrg / clean-html-solr-pipeline.py
Last active May 6, 2019 12:45
Luigi pipeline that: 1. Reads a TDF file using pandas with HTML in the 'content' column and creates another TDF with just the text of the HTML (BeautifulSoup). 2. Indexes the text into a Solr collection using mysolr.
import re
import json
import luigi
import pandas as pd
from mysolr import Solr
from bs4 import BeautifulSoup
class InputText(luigi.ExternalTask):
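The preview stops at the InputText task. A minimal sketch of the HTML-to-text step the description refers to, using the BeautifulSoup import above; the whitespace cleanup and column names are assumptions:
def html_to_text(html):
    # Strip tags with BeautifulSoup and collapse whitespace;
    # the 'content' column is assumed to hold the raw HTML.
    text = BeautifulSoup(html).get_text(separator=' ')
    return re.sub(r'\s+', ' ', text).strip()

# e.g. df['text'] = df['content'].apply(html_to_text)
# Step 2 (not shown here) would push the cleaned rows to Solr via the mysolr client imported above.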

Keybase proof

I hereby claim:

  • I am danielfrg on github.
  • I am danielfrg (https://keybase.io/danielfrg) on keybase.
  • I have a public key ASDYKve9COIyFov3ozEHC6eHuRZFZqPQq8b1ezthy4hNVgo

To claim this, I am signing this object:

This file has been truncated.
<!DOCTYPE html>
<html>
<head><meta charset="utf-8" />
<title>matplotlib</title><script src="https://cdnjs.cloudflare.com/ajax/libs/require.js/2.1.10/require.min.js"></script><link rel="stylesheet" href="https://unpkg.com/font-awesome@4.7.0/css/font-awesome.min.css" type="text/css" />
<style type="text/css">
@danielfrg
danielfrg / set_campaign_goal.py
Last active January 24, 2024 17:40
Google Ads API set campaign goal
#!/usr/bin/env python
# Copyright 2018 Google LLC
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# https://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software