Sebastian Nagel sebastian-nagel

## get_dmoz_news_links.sh
#!/bin/bash

#### extract news sites from DMOZ.org ####

# dependencies
#  Linux
#  bash
#  wget
#  perl
#  regexp-assemble

## cs_despam_host_pagerank.py
import fileinput
import sys
# Use the public module: `_collections` is CPython's private C accelerator
# and is not a documented import location for defaultdict.
from collections import defaultdict
from math import log

import tldextract


# Divergence thresholds for rank and host length.
# NOTE(review): the code that applies these values is outside this excerpt —
# confirm units and comparison direction before tuning.
RANK_DIVERGENCE_THR = 0.02
HOST_LENGTH_DIVERGENCE_THR = 0.15

## jython_webgraph_commands.sh
### Jython
# install Jython (see https://www.jython.org/download)
wget https://repo1.maven.org/maven2/org/python/jython-standalone/2.7.2/jython-standalone-2.7.2.jar

# clone pywebgraph (fork with modifications)
git clone https://github.com/commoncrawl/py-web-graph.git
cd py-web-graph
# copy console.py into current working directory so that "pywebgraph" is visible as package
cp pywebgraph/console.py .

## REAMDE.md

      
              4 files
            
          
              0 forks
            
          
              0 comments
            
          
              1 star
            
          
                sebastian-nagel
                / REAMDE.md
            
            
              Created
              October 21, 2019 13:05
            
              
                character set and content language correlations
              
          
    correlation metrics between character set and content language

data: Common Crawl September 2019 data set
SQL query on columnar index executed via AWS Athena


## iterate_wet_file.py
from warcio.archiveiterator import ArchiveIterator

# Walk every record of a gzipped WET file; for "conversion" records, read the
# target URI from the WARC headers and decode the record payload as UTF-8.
with open('path/to/file.wet.gz', 'rb') as stream:
  for record in ArchiveIterator(stream):
    # Skip anything that is not a conversion record.
    if record.rec_type != 'conversion':
      continue
    url = record.rec_headers.get_header('WARC-Target-URI')
    text = record.content_stream().read().decode('utf-8')

## sitemap-index-cc-224-trim-ws.xml
<?xml version="1.0" encoding="UTF-8"?>
<sitemapindex xmlns="http://www.sitemaps.org/schemas/sitemap/0.9">
 <sitemap>
   <loc>
     <![CDATA[ http://www.example.com/sitemap1.xml ]]>
   </loc>
   <lastmod>
     <![CDATA[ 2018-12-12 02:06:56 ]]>
   </lastmod>
 </sitemap>

## cdx_get_warc_record.py
import fileinput
import sys

import boto3
import botocore

import ujson as json


no_sign_request = botocore.client.Config(

## watlinks.path.freq.txt
#% zgrep '^{"Container' .../CC-MAIN-XXX-XXX.warc.wat.gz \
#  | jq --raw-output '."Envelope"."Payload-Metadata"."HTTP-Response-Metadata"."HTML-Metadata"."Links"[]?.path' \
#  | sort | uniq -c | sort -k1,1nr
# see also:
#   https://github.com/commoncrawl/ia-web-commons/issues/9
#   https://github.com/commoncrawl/ia-web-commons/issues/8
#   https://github.com/iipc/webarchive-commons/pull/72
7777908	A@/href
1266284	IMG@/src
90022	STYLE/#text

## pyspark_executor_hangup.py
# hanging executor on Spark 2.1.0 and Python 2.7

from pyspark import SparkContext


class BadEncodedException(Exception):
    """Exception whose message is the string form of the given reason.

    The stringified reason is passed to ``Exception.__init__`` and is also
    kept on the instance as ``msg``.
    """

    def __init__(self, reason):
        message = str(reason)
        # Explicit two-argument super() keeps the class usable on Python 2.7.
        super(BadEncodedException, self).__init__(message)
        self.msg = message

## common-crawl-cdx.py
# -*- coding: utf-8 -*-
"""
common-crawl-cdx.py

A simple example program to analyze the Common Crawl index.

This is implemented as a single stream job which accesses S3 via HTTP,
so that it can easily be run from any laptop, but it could easily be
converted to an EMR job which processes the 300 index files in parallel.
	#!/bin/bash

	#### extract news sites from DMOZ.org ####

	# dependencies
	# Linux
	# bash
	# wget
	# perl
	# regexp-assemble
	import fileinput
	import sys
	import tldextract
	from _collections import defaultdict
	from math import log


	RANK_DIVERGENCE_THR = 0.02
	HOST_LENGTH_DIVERGENCE_THR = 0.15
	### Jython
	# install Jython (see https://www.jython.org/download)
	wget https://repo1.maven.org/maven2/org/python/jython-standalone/2.7.2/jython-standalone-2.7.2.jar

	# clone pywebgraph (fork with modifications)
	git clone https://github.com/commoncrawl/py-web-graph.git
	cd py-web-graph
	# copy console.py into current working directory so that "pywebgraph" is visible as package
	cp pywebgraph/console.py .
	from warcio.archiveiterator import ArchiveIterator

	with open('path/to/file.wet.gz', 'rb') as stream:
	for record in ArchiveIterator(stream):
	if record.rec_type == 'conversion':
	url = record.rec_headers.get_header('WARC-Target-URI')
	text = record.content_stream().read().decode('utf-8')
	<?xml version="1.0" encoding="UTF-8"?>
	<sitemapindex xmlns="http://www.sitemaps.org/schemas/sitemap/0.9">
	<sitemap>
	<loc>
	<![CDATA[ http://www.example.com/sitemap1.xml ]]>
	</loc>
	<lastmod>
	<![CDATA[ 2018-12-12 02:06:56 ]]>
	</lastmod>
	</sitemap>
	import fileinput
	import sys

	import boto3
	import botocore

	import ujson as json


	no_sign_request = botocore.client.Config(
	#% zgrep '^{"Container' .../CC-MAIN-XXX-XXX.warc.wat.gz \
	# \| jq --raw-output '."Envelope"."Payload-Metadata"."HTTP-Response-Metadata"."HTML-Metadata"."Links"[]?.path' \
	# \| sort \| uniq -c \| sort -k1,1nr
	# see also:
	# https://github.com/commoncrawl/ia-web-commons/issues/9
	# https://github.com/commoncrawl/ia-web-commons/issues/8
	# https://github.com/iipc/webarchive-commons/pull/72
	7777908 A@/href
	1266284 IMG@/src
	90022 STYLE/#text
	# hanging executor on Spark 2.1.0 and Python 2.7

	from pyspark import SparkContext


	class BadEncodedException(Exception):
	def __init__(self, reason):
	self.msg = str(reason)
	super(BadEncodedException, self).__init__(self.msg)
	# -- coding: utf-8 --
	"""
	common-crawl-cdx.py

	A simple example program to analyze the Common Crawl index.

	This is implemented as a single stream job which accesses S3 via HTTP,
	so that it can easily be run from any laptop, but it could easily be
	converted to an EMR job which processes the 300 index files in parallel.