
Shriphani Palakodety shriphani

shriphani / decompress_and_recompress.py
Created March 6, 2013 08:18
Repackage KBA data (each file is decrypted, decompressed, and recompressed with gzip to make it easier to handle)
"""
We have a bunch of files of the form social.*.xz.gpg.save
These need to be converted to gzip format.
"""
import argparse
import os
def decrypt(filename):
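The preview above cuts off at `decrypt`. A minimal sketch of the pipeline the description outlines — gpg-decrypt each `*.xz.gpg.save` file, then rewrap the xz payload as gzip — assuming a `gpg` binary on `PATH` with the key already imported (the helper names and the streaming chunk size are illustrative, not the gist's actual code):

```python
import gzip
import lzma
import subprocess


def decrypt(filename):
    # Assumption: gpg is installed and the decryption key is imported.
    out = filename.replace('.gpg.save', '')
    subprocess.check_call(['gpg', '--output', out, '--decrypt', filename])
    return out


def xz_to_gzip(xz_path, gz_path):
    # Stream-decompress the xz file and recompress with gzip,
    # one megabyte at a time, so large corpus files fit in memory.
    with lzma.open(xz_path, 'rb') as src, gzip.open(gz_path, 'wb') as dst:
        for chunk in iter(lambda: src.read(1 << 20), b''):
            dst.write(chunk)
```

Streaming through `lzma.open`/`gzip.open` avoids shelling out to `xz` and `gzip` the way a 2013-era script likely did.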
shriphani / scrape-crawler-beans.cxml
Created March 26, 2013 20:16
Heritrix config file to scrape pages with a particular URL format (specified using a regex).
<?xml version="1.0" encoding="UTF-8"?>
<!--
clueweb12++ Crawl job configuration file
========================================
This file is the template for the job configurations.
It is based on the sample Heritrix 3 job configuration file.
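The config above admits pages by regex (in Heritrix 3 this is typically a `MatchesRegexDecideRule` bean in the scope's decide-rule chain). The accept/reject decision itself can be sketched outside Heritrix in a few lines of Python; the pattern below is an illustrative assumption, not the gist's actual regex:

```python
import re

# Hypothetical scope: accept only numbered thread pages on one host.
ACCEPT = re.compile(r'^https?://example\.com/forum/thread/\d+$')


def decide(url):
    """Mimic a regex decide rule: ACCEPT on match, REJECT otherwise."""
    return 'ACCEPT' if ACCEPT.match(url) else 'REJECT'
```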
shriphani / kba_download_2013_stream.py
Created April 11, 2013 20:50
Gets a list of files from the new KBA corpus. Can be used to filter out files from a wget dump.
#!/usr/bin/env python
'''
Script to download the 2013 corpus
'''
import requests
import urlparse
from BeautifulSoup import BeautifulSoup, SoupStrainer
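The script scrapes the corpus index page for file links (the preview shows `requests` plus the old `BeautifulSoup`/`SoupStrainer` API). The same link-harvesting step can be sketched with only the standard library; the base URL and the `.xz.gpg` suffix filter are assumptions about what the listing looks like:

```python
from html.parser import HTMLParser
from urllib.parse import urljoin


class LinkCollector(HTMLParser):
    """Collect absolute hrefs, akin to BeautifulSoup + SoupStrainer('a')."""

    def __init__(self, base_url):
        super().__init__()
        self.base_url = base_url
        self.links = []

    def handle_starttag(self, tag, attrs):
        if tag == 'a':
            for name, value in attrs:
                if name == 'href' and value:
                    self.links.append(urljoin(self.base_url, value))


def list_corpus_files(html, base_url, suffix='.xz.gpg'):
    # Keep only links that look like corpus files (suffix is assumed).
    parser = LinkCollector(base_url)
    parser.feed(html)
    return [u for u in parser.links if u.endswith(suffix)]
```

In practice the `html` argument would come from `requests.get(index_url).text`.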
shriphani / nabble_heritrix_setup.py
Last active December 16, 2015 03:09
Performs a full crawl of Nabble, iteratively re-crawling the sites that previously timed out
#!/usr/bin/env python
'''
Our Nabble crawl contains a large number of 503s. Poll Heritrix and set up new jobs.
'''
import argparse
import daemon
import os
import sys
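The poll-and-resubmit loop the docstring describes can be sketched generically. Here `get_status` and `relaunch` are hypothetical stand-ins for the Heritrix REST calls the real script would make; statuses and the retry policy are assumptions:

```python
import time


def poll_until_done(job_dirs, get_status, relaunch, interval=0.0):
    """Poll each job; relaunch any that timed out, until all finish.

    get_status(job) -> 'RUNNING' | 'FINISHED' | 'TIMED_OUT' (assumed API)
    relaunch(job)   -> start a fresh crawl of that job (assumed API)
    """
    pending = list(job_dirs)
    while pending:
        still_pending = []
        for job in pending:
            status = get_status(job)
            if status == 'FINISHED':
                continue
            if status == 'TIMED_OUT':
                relaunch(job)
            still_pending.append(job)
        pending = still_pending
        if pending:
            time.sleep(interval)  # back off between polling rounds
```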
<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
<modelVersion>4.0.0</modelVersion>
<groupId>lemurproject</groupId>
<artifactId>sutime-clojure</artifactId>
<version>0.0.1-SNAPSHOT</version>
<packaging>jar</packaging>
<name>sutime-clojure</name>
import edu.stanford.nlp.*;
import edu.stanford.nlp.pipeline.*;
import edu.stanford.nlp.time.TimeAnnotations;
import edu.stanford.nlp.util.CoreMap;
import edu.stanford.nlp.ling.*;
import edu.stanford.nlp.ling.CoreAnnotations.NamedEntityTagAnnotation;
import edu.stanford.nlp.ling.CoreAnnotations.NormalizedNamedEntityTagAnnotation;
import edu.stanford.nlp.ling.tokensregex.types.Expressions.VarAssignmentExpression;
import java.util.*;
shriphani / redirect_resolve.py
Created April 19, 2013 22:54
Using the requests module to resolve URLs (follow redirects to the final URL)
'''Resolve a url'''
import argparse
import requests
def resolve_url(url):
return requests.get(url).url
if __name__ == '__main__':
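`requests.get(url).url` follows the whole redirect chain and reports the final URL. The mechanism can be made explicit with a small hand-rolled follower; `get_location` is an injected fetcher (in practice it would wrap something like `requests.head(url).headers.get('Location')`), and the hop limit is an assumption:

```python
def follow_redirects(url, get_location, max_hops=10):
    """Resolve a redirect chain by hand.

    get_location(url) returns the redirect target for a URL, or None
    when the URL is a final (non-redirecting) page.
    """
    seen = set()
    for _ in range(max_hops):
        if url in seen:
            raise ValueError('redirect loop at %s' % url)
        seen.add(url)
        nxt = get_location(url)
        if nxt is None:
            return url
        url = nxt
    raise ValueError('too many redirects')
```

The loop and hop-count guards mirror what requests does internally before raising `TooManyRedirects`.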
shriphani / cmd_heritrix.py
Created April 21, 2013 19:19
Operate a Heritrix instance hosted at the default address from the command line
'''
Heritrix control from the command line
All control can be done by using job directories on the command line
'''
import argparse
import os
def stop_job(job_dir):
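The preview cuts off at `stop_job`. Heritrix 3 is controlled over a REST API: actions such as `pause`, `unpause`, and `terminate` are POSTed to the job's URL on the engine (normally `https://localhost:8443/engine`, with digest auth). A sketch of just the request construction, with the job-naming convention an assumption:

```python
import os

ENGINE = 'https://localhost:8443/engine'  # Heritrix's default address


def job_action_request(job_dir, action):
    """Build the (url, form-data) pair for a Heritrix 3 job action POST.

    Assumes the job's name is the last component of its job directory;
    actually sending it would need requests plus HTTP digest auth.
    """
    job_name = os.path.basename(os.path.normpath(job_dir))
    return ('%s/job/%s' % (ENGINE, job_name), {'action': action})
```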
shriphani / ygroups_pauser.py
Created April 24, 2013 18:05
ygroups Heritrix crawl setup (periodically pauses and unpauses the download)
'''
The purpose of this script is to keep pausing / unpausing
the ygroups download
'''
import argparse
import os
import sys
import time
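The pause/unpause schedule can be reduced to a pure duty-cycle decision, which a `while` loop with `time.sleep` would then act on by POSTing pause or unpause to Heritrix. The cycle lengths here are assumptions, since the gist's actual schedule is not shown:

```python
def duty_cycle(elapsed, run_secs, pause_secs):
    """Decide whether the crawl should be running at a given moment.

    Alternates run_secs of crawling with pause_secs of rest, starting
    with a running phase at elapsed == 0.
    """
    return elapsed % (run_secs + pause_secs) < run_secs
```

Keeping the decision pure makes the scheduler trivial to test without a live crawler.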
shriphani / nabble_scrape_index_pages.py
Created April 25, 2013 10:23
Scrapes the downloaded index pages and sets up the next stage of the crawl
'''
Index pages scraper that goes through and finds the most recent pages
'''
import argparse
import os
import sys
import warc
from BeautifulSoup import BeautifulSoup, SoupStrainer
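The docstring says the scraper "finds the most recent pages" among the downloaded index pages. One way to sketch that step: group paginated URLs by topic and keep the highest page number per topic. The `<topic>/<page>` URL shape is an assumption about Nabble's layout, not taken from the gist:

```python
import re

# Assumed layout: index URLs end in '<topic>/<page-number>'.
PAGE_RE = re.compile(r'^(?P<topic>.+?)/(?P<page>\d+)$')


def most_recent_pages(urls):
    """For each topic, keep only the URL with the highest page number."""
    best = {}
    for url in urls:
        m = PAGE_RE.match(url)
        if not m:
            continue
        topic, page = m.group('topic'), int(m.group('page'))
        if page > best.get(topic, (None, -1))[1]:
            best[topic] = (url, page)
    return sorted(url for url, _ in best.values())
```

In the real script the candidate URLs would be extracted from each WARC record's HTML with BeautifulSoup before this filtering step.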