Russell Jurney (rjurney)
@rjurney
rjurney / base_class.py
Created June 9, 2022 09:42
Example of a GraphletBaseClass interface for serializing plain old python objects as DataFrames to encapsulate ETL logic inside a reusable class
import uuid

class GitRepo(GraphletBaseClass):
    """A GitRepo is a source code repository in a knowledge graph about the open source ecosystem."""

    structure = "node"
    name = "repo"

    def __init__(self, name, url, description, platform):
        """Instantiate a GitRepo given its properties"""
        self._id = uuid.uuid4()
@rjurney
rjurney / spark_google_sheets.py
Created November 9, 2021 21:28
How to load a Google Sheet in PySpark
from pyspark.sql import SparkSession

# Load the package https://github.com/potix2/spark-google-spreadsheets via Maven
spark = (
    SparkSession.builder
    .appName("Testing Spark Google Sheets")
    .config("spark.jars.packages", "com.github.potix2:spark-google-spreadsheets_2.11:0.6.3")
    .getOrCreate()
)
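The preview stops at session creation; per the README of the linked potix2/spark-google-spreadsheets package, reading a sheet would presumably look like the sketch below, where the service account, credential path, spreadsheet ID, and worksheet name are all placeholder assumptions:

# A hedged sketch based on the package README; every value below is a placeholder
df = (
    spark.read
    .format("com.github.potix2.spark.google.spreadsheets")
    .option("serviceAccountId", "xxxxxx@developer.gserviceaccount.com")
    .option("credentialPath", "/path/to/credential.p12")
    .load("<spreadsheetId>/worksheet1")
)
df.show()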
@rjurney
rjurney / latest
Created May 14, 2021 19:40
Bash script for Mac OS X to find the latest files in a recursive path
#!/bin/bash
#
# I search recursively for the latest modified files. I work on Mac OS X. I am incredibly useful! Thanks cobbzilla!
#
# I come from https://stackoverflow.com/questions/5566310/how-to-recursively-find-and-list-the-latest-modified-files-in-a-directory-with-s#comment57080115_7448828
# Which was created by https://stackoverflow.com/users/1251543/cobbzilla
#
FILE_PATH=$1
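The preview ends right after the argument is captured; based on the credited Stack Overflow comment, the body is presumably a BSD find/stat pipeline along these lines (the stat format string and result count are assumptions):

# Hedged completion based on the credited Stack Overflow comment; the stat
# format string and the `head -10` count are assumptions
find "${FILE_PATH}" -type f -print0 \
  | xargs -0 stat -f "%m %N" \
  | sort -rn \
  | head -10 \
  | cut -f2- -d" "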
@rjurney
rjurney / README.md
Last active March 17, 2021 01:32
Our README process for versioning files under DVC - is this right?

Models and their Files

In addition to versioning data under the data/ directory, we store models in the models/ directory and version them with DVC. PySpark reads models from the cluster nodes at /tmp/models/, so when using PySpark you will need to copy models there and refer to them by those Spark-local paths.

To add a file to DVC, you need to run this series of commands. Skipping any of them can result in the deletion of files recently added by your co-workers :( It is very important that you git pull and then dvc pull, one after the other, before you add files and dvc push. You can add new files to DVC this way, or re-add existing ones to update their version.

# Get the latest .dvc files and configuration for the project
git pull origin <branch>
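The preview ends after the first command; the rest of the series is presumably the standard DVC pull/add/push cycle, roughly as follows (<branch> and <file> are placeholders):

# Hedged sketch of the remaining commands; <branch> and <file> are placeholders
dvc pull                        # fetch the data the committed .dvc files point at
dvc add <file>                  # (re-)track the file, writing or updating <file>.dvc
git add <file>.dvc              # stage the pointer file for git
git commit -m "Track <file> in DVC"
dvc push                        # upload the data to the DVC remote
git push origin <branch>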
@rjurney
rjurney / gist:a0f31d30e1ec9975865da20ab02976b2
Created March 12, 2021 19:00
What the FUCK am I doing wrong with DVC?
$ dvc add data/demo/Demo\ triples\ and\ properties\ data\ -\ 20200202.xlsx
100% Add|████████████████████|1/1 [00:00, 1.41file/s]
$ git add -f data/demo/Demo\ triples\ and\ properties\ data\ -\ 20200202.xlsx.dvc
$ git commit -m "Now tracking Demo 1/2 dataset in DVC"
[43-demo-data-json ddb47e9] Now tracking Demo 1/2 dataset in DVC
1 file changed, 4 insertions(+)
create mode 100644 data/demo/Demo triples and properties data - 20200202.xlsx.dvc
@rjurney
rjurney / a_working_pickle.py
Created February 20, 2021 05:28
Can't use frozendict due to pickle error even though frozendict very happily pickles :(
import pickle

from frozendict import frozendict

# frozendict round-trips through pickle without complaint
b = pickle.dumps(frozendict({"hello": "bob"}))
p = pickle.loads(b)
print(p)
@rjurney
rjurney / results.py
Created February 16, 2021 22:57
Anyone know what is wrong with this sentence?
[One, year]
[in, an]
[One]
[self, -]
[cafe, .]
[week, ,]
[-]
[week, ,, a]
[-, confessed]
[religious, extremism]
@rjurney
rjurney / code.py
Created January 28, 2021 18:55
Can't serialize Docs created with spacy-transformers
import spacy
from spacy.cli import download
from spacy.tokens import DocBin

# Load the spaCy transformers model based on English web content
download("en_core_web_trf")
# download("en_core_web_lg")
nlp = spacy.load("en_core_web_trf")

# Store the documents of the articles because the transformer model is slow
doc_bin = DocBin(attrs=["LEMMA", "ENT_IOB", "ENT_TYPE"], store_user_data=False)
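The preview stops before the serialization the title complains about; the continuation presumably runs the pipeline and serializes the DocBin, roughly like this sketch (the texts variable is an assumption):

# Hedged sketch of the failing step; `texts` is a placeholder for the articles
texts = ["Example article text ..."]
for doc in nlp.pipe(texts):
    doc_bin.add(doc)
bytes_data = doc_bin.to_bytes()  # presumably where serialization fails for transformer Docs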
@rjurney
rjurney / bulk_load_elastic.sh
Last active January 16, 2021 01:08
What am I doing wrong in this Elastic bulk load command? I can't figure it out!
# Read one path per line; the input redirect must come after `done`, not on
# the `read` itself, or every iteration re-reads the first line of the file
while IFS= read -r path
do
    echo "Submitting Elastic formatted docs at ${path} to Elastic index 'docs' ..."
    curl \
        -X POST \
        -H "Content-Type: application/x-ndjson" \
        "https://${ES_USER}:${ES_PASSWD}@${ES_HOSTNAME}:${ES_PORT}/docs/_bulk" \
        --data-binary "@${path}"
done < "${DOC_LIST_PATH}"
@rjurney
rjurney / load_directory_json.gz.sh
Created December 19, 2020 19:45
How to bulk load gzip'd JSON in Elastic
# Bulk load the Foo data we prepared via PySpark in etl/transform_foo.spark.py
for path in data/foo/elastic/part*
do
    file=$(basename "${path}")
    echo "Submitting ${path} to Elastic index foo ..."
    curl ${USER_STRING} \
        -X POST \
        -H "Content-encoding: gzip" \
        -H "Content-Type: application/x-ndjson" \