Skip to content

Instantly share code, notes, and snippets.

@ruebot
Created May 14, 2020 15:54
Show Gist options
  • Save ruebot/d14d15f43da723ba6143807df047e09c to your computer and use it in GitHub Desktop.
Save ruebot/d14d15f43da723ba6143807df047e09c to your computer and use it in GitHub Desktop.
Display the source blob
Display the rendered blob
Raw
{
"cells": [
{
"cell_type": "code",
"execution_count": 13,
"metadata": {},
"outputs": [],
"source": [
"from aut import *"
]
},
{
"cell_type": "code",
"execution_count": 14,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"+-----------------------------------------------------------------------------------+----------------+\n",
"|url |http_status_code|\n",
"+-----------------------------------------------------------------------------------+----------------+\n",
"|http://geocities.com/babiekaos/Links.html |200 |\n",
"|http://geocities.com/cloneaccount3/6490/ |200 |\n",
"|http://www.geocities.com/coledale28/hi-power-soldiers-music.html |200 |\n",
"|http://www.geocities.com/orvilleduncan811/12-day-of-christmas-sheet-music.html |200 |\n",
"|http://geocities.com/jtbm71/fotos/2000/ |200 |\n",
"|http://geocities.com/cancmay/s/sunshine.html |200 |\n",
"|http://www.talent-direct.com/cgi-bin/tal_pro.cgi?profile=ARZCdYbJU5KsMARKdUxiO4l3DY|200 |\n",
"|http://geocities.com/akimi919/sp_ph/?M=A |200 |\n",
"|http://geocities.com/cancmay/s/save-tonight.html |200 |\n",
"|http://www.geocities.com/orvilleduncan811/child-youth-elbow-knee-pad.html |200 |\n",
"+-----------------------------------------------------------------------------------+----------------+\n",
"only showing top 10 rows\n",
"\n"
]
}
],
"source": [
"WebArchive(sc, sqlContext, \"/home/nruest/Projects/au/sample-data/geocities\")\\\n",
" .all()\\\n",
" .select(\"url\", \"http_status_code\")\\\n",
" .show(10, False)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"WebArchive(sc, sqlContext, \"/home/nruest/Projects/au/sample-data/geocities\")\\\n",
" .all()\\\n",
" .select(\"url\", \"archive_filename\")\\\n",
" .show(10, True)"
]
},
{
"cell_type": "code",
"execution_count": 16,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"+---------------------------+-----+\n",
"|Domain |count|\n",
"+---------------------------+-----+\n",
"|geocities.com |93886|\n",
"|www.geocities.com |29223|\n",
"|www.infocastfn.com |430 |\n",
"|rcm.amazon.com |201 |\n",
"|www.bagus.com |133 |\n",
"|www.globalimagegallery.com |130 |\n",
"|www.physforum.com |124 |\n",
"|www.internetarchaeology.org|121 |\n",
"|us.geocities.com |121 |\n",
"|www.spb.tvoe.tv |108 |\n",
"+---------------------------+-----+\n",
"only showing top 10 rows\n",
"\n"
]
}
],
"source": [
"from pyspark.sql.functions import desc\n",
"\n",
"WebArchive(sc, sqlContext, \"/home/nruest/Projects/au/sample-data/geocities\")\\\n",
" .webpages()\\\n",
" .select(Udf.extract_domain(\"url\").alias(\"Domain\"))\\\n",
" .groupBy(\"Domain\")\\\n",
" .count()\\\n",
" .sort(desc(\"count\"))\\\n",
" .show(10, False)\n"
]
},
{
"cell_type": "code",
"execution_count": 17,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"+----------+--------------------+--------------------+--------------------+\n",
"|crawl_date| domain| url| content|\n",
"+----------+--------------------+--------------------+--------------------+\n",
"| 20091027| geocities.com|http://geocities....|Sushi Land Sushi ...|\n",
"| 20091027| geocities.com|http://geocities....|Andrea Cruz Welco...|\n",
"| 20091027| www.geocities.com|http://www.geocit...|Hi Power Soldiers...|\n",
"| 20091027| www.geocities.com|http://www.geocit...|12 Day Of Christm...|\n",
"| 20091027| geocities.com|http://geocities....|Index of /jtbm71/...|\n",
"| 20091027| geocities.com|http://geocities....|sunshine CanCMay ...|\n",
"| 20091027|www.talent-direct...|http://www.talent...|talent direct voi...|\n",
"| 20091027| geocities.com|http://geocities....|Index of /akimi91...|\n",
"| 20091027| geocities.com|http://geocities....|stardust CanCMay ...|\n",
"| 20091027| www.geocities.com|http://www.geocit...|Child Youth Elbow...|\n",
"+----------+--------------------+--------------------+--------------------+\n",
"only showing top 10 rows\n",
"\n"
]
}
],
"source": [
"WebArchive(sc, sqlContext, \"/home/nruest/Projects/au/sample-data/geocities\")\\\n",
" .webpages()\\\n",
" .select(\"crawl_date\", Udf.extract_domain(\"url\").alias(\"domain\"), \"url\", Udf.remove_html(\"content\").alias(\"content\"))\\\n",
" .show(10, True)"
]
},
{
"cell_type": "code",
"execution_count": 18,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"+--------------------+\n",
"| content|\n",
"+--------------------+\n",
"|Sushi Land Sushi ...|\n",
"|Andrea Cruz Welco...|\n",
"|Hi Power Soldiers...|\n",
"|12 Day Of Christm...|\n",
"|Index of /jtbm71/...|\n",
"|sunshine CanCMay ...|\n",
"|talent direct voi...|\n",
"|Index of /akimi91...|\n",
"|stardust CanCMay ...|\n",
"|Child Youth Elbow...|\n",
"+--------------------+\n",
"only showing top 10 rows\n",
"\n"
]
}
],
"source": [
"WebArchive(sc, sqlContext, \"/home/nruest/Projects/au/sample-data/geocities\")\\\n",
" .webpages()\\\n",
" .select(Udf.remove_html(Udf.remove_http_header(\"content\")).alias(\"content\"))\\\n",
" .show(10, True)"
]
},
{
"cell_type": "code",
"execution_count": 19,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"+----------+--------------------+--------------------+--------------------+\n",
"|crawl_date| domain| url| content|\n",
"+----------+--------------------+--------------------+--------------------+\n",
"| 20091027| geocities.com|http://geocities....|Sushi Land Sushi ...|\n",
"| 20091027| geocities.com|http://geocities....|Andrea Cruz Welco...|\n",
"| 20091027| www.geocities.com|http://www.geocit...|Hi Power Soldiers...|\n",
"| 20091027| www.geocities.com|http://www.geocit...|12 Day Of Christm...|\n",
"| 20091027| geocities.com|http://geocities....|Index of /jtbm71/...|\n",
"| 20091027| geocities.com|http://geocities....|sunshine CanCMay ...|\n",
"| 20091027|www.talent-direct...|http://www.talent...|talent direct voi...|\n",
"| 20091027| geocities.com|http://geocities....|Index of /akimi91...|\n",
"| 20091027| geocities.com|http://geocities....|stardust CanCMay ...|\n",
"| 20091027| www.geocities.com|http://www.geocit...|Child Youth Elbow...|\n",
"+----------+--------------------+--------------------+--------------------+\n",
"only showing top 10 rows\n",
"\n"
]
}
],
"source": [
"WebArchive(sc, sqlContext, \"/home/nruest/Projects/au/sample-data/geocities\")\\\n",
" .webpages()\\\n",
" .select(\"crawl_date\", Udf.extract_domain(\"url\").alias(\"domain\"), \"url\", Udf.remove_html(Udf.remove_http_header(\"content\")).alias(\"content\"))\\\n",
" .show(10, True)"
]
},
{
"cell_type": "code",
"execution_count": 20,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"+----------+--------------------+--------------------+--------------------+\n",
"|crawl_date| domain| url| content|\n",
"+----------+--------------------+--------------------+--------------------+\n",
"| 20091027| geocities.com|http://geocities....|Nori (seaweed) wa...|\n",
"| 20091027| geocities.com|http://geocities....| This site is about:|\n",
"| 20091027| www.geocities.com|http://www.geocit...|Hi Power Soldiers...|\n",
"| 20091027| www.geocities.com|http://www.geocit...|12 Day Of Christm...|\n",
"| 20091027| geocities.com|http://geocities....| |\n",
"| 20091027| geocities.com|http://geocities....|CanCMay Sunshine ...|\n",
"| 20091027|www.talent-direct...|http://www.talent...| |\n",
"| 20091027| geocities.com|http://geocities....| |\n",
"| 20091027| geocities.com|http://geocities....|Save Tonight Mind...|\n",
"| 20091027| www.geocities.com|http://www.geocit...|Child Youth Elbow...|\n",
"+----------+--------------------+--------------------+--------------------+\n",
"only showing top 10 rows\n",
"\n"
]
}
],
"source": [
"WebArchive(sc, sqlContext, \"/home/nruest/Projects/au/sample-data/geocities\")\\\n",
" .webpages()\\\n",
" .select(\"crawl_date\", Udf.extract_domain(\"url\").alias(\"domain\"), \"url\", Udf.extract_boilerplate(Udf.remove_http_header(\"content\")).alias(\"content\"))\\\n",
" .show(10, True)"
]
},
{
"cell_type": "code",
"execution_count": 21,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"+----------+--------------------+--------------------+--------------------+\n",
"|crawl_date| domain| url| content|\n",
"+----------+--------------------+--------------------+--------------------+\n",
"| 20091027| geocities.com|http://geocities....|\r\n",
"<html>\r\n",
"\r\n",
"<head...|\n",
"| 20091027| geocities.com|http://geocities....|<html><head><titl...|\n",
"| 20091027| www.geocities.com|http://www.geocit...|<html>\r\n",
"\r\n",
"<head>\r",
"...|\n",
"| 20091027| www.geocities.com|http://www.geocit...|<html>\r\n",
"\r\n",
"<head>\r",
"...|\n",
"| 20091027| geocities.com|http://geocities....|<!DOCTYPE HTML PU...|\n",
"| 20091027| geocities.com|http://geocities....|<html>\r\n",
"<head><ti...|\n",
"| 20091027|www.talent-direct...|http://www.talent...|\r\n",
"\r\n",
"<!DOCTYPE htm...|\n",
"| 20091027| geocities.com|http://geocities....|<!DOCTYPE HTML PU...|\n",
"| 20091027| geocities.com|http://geocities....|<html>\r\n",
"<head><ti...|\n",
"| 20091027| www.geocities.com|http://www.geocit...|<html>\r\n",
"\r\n",
"<head>\r",
"...|\n",
"+----------+--------------------+--------------------+--------------------+\n",
"only showing top 10 rows\n",
"\n"
]
}
],
"source": [
"WebArchive(sc, sqlContext, \"/home/nruest/Projects/au/sample-data/geocities\")\\\n",
" .webpages()\\\n",
" .select(\"crawl_date\", Udf.extract_domain(\"url\").alias(\"domain\"), \"url\", Udf.remove_http_header(\"content\").alias(\"content\"))\\\n",
" .show(10, True)"
]
},
{
"cell_type": "code",
"execution_count": 22,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"+--------------------+--------------------+--------------------+--------------------+--------------+------+-----+\n",
"| udf_sha1| sha1| udf_md5| md5|udf_image_size|height|width|\n",
"+--------------------+--------------------+--------------------+--------------------+--------------+------+-----+\n",
"|99d74a3b4fbd6d7cd...|9ca2bc31550f9369e...|ce1b5ab3e51fd9f6b...|ce4c718e925105232...| [0, 0]| 432| 288|\n",
"|245b94c90eac0dcd9...|ff0467d8d2cbc5d50...|9b8909a52d94b6d17...|f6b631a4db5f4c7a3...| [0, 0]| 103| 1200|\n",
"|cd19e4e7e2dd9e090...|faa81452f0c19b304...|a97c139a3a31467ae...|4f59788bde58d15d5...| [0, 0]| 1| 1|\n",
"|fd5eb52badba72a29...|0720946d3ced04976...|83ca84887072a62b9...|2677171223600bf34...| [0, 0]| 480| 1050|\n",
"|9333370d1f79af66c...|f9aa611fc62b735c3...|586628aaae7e0076a...|0a089830419a5c0ed...| [0, 0]| 315| 217|\n",
"|676b4a596a901024a...|5bb4bf5dfe39520a3...|dcec4d3ffac515f73...|a0210969ba9fac53a...| [0, 0]| 156| 136|\n",
"|4c99aa50462f84723...|b8a56b4dc015bdcc2...|fb51b7a1e1c25dc87...|c7d81ae036f502cf3...| [0, 0]| 32| 200|\n",
"|2bd92aea1b6370079...|0075394d3de702d27...|ad30253c36cb51e8a...|835fa6581c493ad15...| [0, 0]| 60| 600|\n",
"|86a7bcceae53c92b6...|2d99c303d7e8ca75f...|00628da87d6300e7c...|0134e45aca6297e8c...| [0, 0]| 36| 140|\n",
"|c5fa5c7c1a897f136...|429e3558e2b579426...|f9411618cf0a1c858...|a7b85484410cde43e...| [0, 0]| 640| 480|\n",
"+--------------------+--------------------+--------------------+--------------------+--------------+------+-----+\n",
"only showing top 10 rows\n",
"\n"
]
}
],
"source": [
"WebArchive(sc, sqlContext, \"/home/nruest/Projects/au/sample-data/geocities\")\\\n",
" .images()\\\n",
" .select(Udf.compute_sha1(\"bytes\").alias(\"udf_sha1\"), \"sha1\", Udf.compute_md5(\"bytes\").alias(\"udf_md5\"), \"md5\", Udf.compute_image_size(\"bytes\").alias(\"udf_image_size\"), \"height\", \"width\")\\\n",
" .show(10, True)"
]
},
{
"cell_type": "code",
"execution_count": 23,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"+----------+------------+--------+\n",
"|crawl_date|udf_language|language|\n",
"+----------+------------+--------+\n",
"| 20091027| en| en|\n",
"| 20091027| en| en|\n",
"| 20091027| en| en|\n",
"| 20091027| en| en|\n",
"| 20091027| en| en|\n",
"| 20091027| en| en|\n",
"| 20091027| en| en|\n",
"| 20091027| en| ms|\n",
"| 20091027| en| en|\n",
"| 20091027| en| en|\n",
"+----------+------------+--------+\n",
"only showing top 10 rows\n",
"\n"
]
}
],
"source": [
"WebArchive(sc, sqlContext, \"/home/nruest/Projects/au/sample-data/geocities\")\\\n",
" .webpages()\\\n",
" .select(\"crawl_date\", Udf.detect_language(\"content\").alias(\"udf_language\"), \"language\")\\\n",
" .show(10, True)"
]
},
{
"cell_type": "code",
"execution_count": 24,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"+----------+--------------------+--------------------+\n",
"|crawl_date| udf_tika| mime_type_tika|\n",
"+----------+--------------------+--------------------+\n",
"| 20091027| text/html| text/html|\n",
"| 20091027| text/html| text/html|\n",
"| 20091027| text/html| text/html|\n",
"| 20091027| text/html| text/html|\n",
"| 20091027| text/html| text/html|\n",
"| 20091027| text/html| text/html|\n",
"| 20091027|application/xhtml...|application/xhtml...|\n",
"| 20091027| text/html| text/html|\n",
"| 20091027| text/html| text/html|\n",
"| 20091027| text/html| text/html|\n",
"+----------+--------------------+--------------------+\n",
"only showing top 10 rows\n",
"\n"
]
}
],
"source": [
"WebArchive(sc, sqlContext, \"/home/nruest/Projects/au/sample-data/geocities\")\\\n",
" .all()\\\n",
" .select(\"crawl_date\", Udf.detect_mime_type_tika(\"bytes\").alias(\"udf_tika\"), \"mime_type_tika\")\\\n",
" .show(10, True)"
]
},
{
"cell_type": "code",
"execution_count": 25,
"metadata": {},
"outputs": [
{
"ename": "Py4JJavaError",
"evalue": "An error occurred while calling o397.apply.\n: java.lang.ClassCastException: io.archivesunleashed.UdfLoader$$anonfun$hasContent$1 cannot be cast to scala.Function1\n\tat org.apache.spark.sql.catalyst.expressions.ScalaUDF.<init>(ScalaUDF.scala:104)\n\tat org.apache.spark.sql.expressions.UserDefinedFunction.apply(UserDefinedFunction.scala:85)\n\tat sun.reflect.GeneratedMethodAccessor70.invoke(Unknown Source)\n\tat sun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)\n\tat java.lang.reflect.Method.invoke(Method.java:498)\n\tat py4j.reflection.MethodInvoker.invoke(MethodInvoker.java:244)\n\tat py4j.reflection.ReflectionEngine.invoke(ReflectionEngine.java:357)\n\tat py4j.Gateway.invoke(Gateway.java:282)\n\tat py4j.commands.AbstractCommand.invokeMethod(AbstractCommand.java:132)\n\tat py4j.commands.CallCommand.execute(CallCommand.java:79)\n\tat py4j.GatewayConnection.run(GatewayConnection.java:238)\n\tat java.lang.Thread.run(Thread.java:748)\n",
"output_type": "error",
"traceback": [
"\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
"\u001b[0;31mPy4JJavaError\u001b[0m Traceback (most recent call last)",
"\u001b[0;32m<ipython-input-25-20fdfdb2bd6b>\u001b[0m in \u001b[0;36m<module>\u001b[0;34m\u001b[0m\n\u001b[1;32m 3\u001b[0m \u001b[0mWebArchive\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0msc\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0msqlContext\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m\"/home/nruest/Projects/au/sample-data/geocities\"\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;31m\\\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 4\u001b[0m \u001b[0;34m.\u001b[0m\u001b[0mwebpages\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;31m\\\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m----> 5\u001b[0;31m \u001b[0;34m.\u001b[0m\u001b[0mfilter\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mUdf\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mhas_content\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m\"content\"\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mcontent\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;31m\\\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 6\u001b[0m \u001b[0;34m.\u001b[0m\u001b[0mselect\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m\"crawl_date\"\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mUdf\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mextract_domain\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m\"url\"\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0malias\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m\"domain\"\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m\"url\"\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mUdf\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mremove_http_header\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m\"content\"\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0malias\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m\"content\"\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;31m\\\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 7\u001b[0m \u001b[0;34m.\u001b[0m\u001b[0mshow\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;36m10\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;32mTrue\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
"\u001b[0;32m/tmp/spark-ebab34db-5e87-43ba-9304-301861215262/userFiles-90a2a3e8-a131-4bcc-91fe-e9e52f8a8941/aut.zip/aut/udfs.py\u001b[0m in \u001b[0;36mhas_content\u001b[0;34m(col, content)\u001b[0m\n\u001b[1;32m 80\u001b[0m \u001b[0;34m.\u001b[0m\u001b[0mapply\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 81\u001b[0m )\n\u001b[0;32m---> 82\u001b[0;31m \u001b[0;32mreturn\u001b[0m \u001b[0;34m(\u001b[0m\u001b[0mColumn\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mudf\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0m_to_seq\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0msc\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m[\u001b[0m\u001b[0mcol\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0m_to_java_column\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mcontent\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 83\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 84\u001b[0m \u001b[0;32mdef\u001b[0m \u001b[0mremove_http_header\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mcol\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
"\u001b[0;32m~/bin/spark-2.4.5-bin-hadoop2.7/python/lib/py4j-0.10.7-src.zip/py4j/java_gateway.py\u001b[0m in \u001b[0;36m__call__\u001b[0;34m(self, *args)\u001b[0m\n\u001b[1;32m 1255\u001b[0m \u001b[0manswer\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mgateway_client\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0msend_command\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mcommand\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 1256\u001b[0m return_value = get_return_value(\n\u001b[0;32m-> 1257\u001b[0;31m answer, self.gateway_client, self.target_id, self.name)\n\u001b[0m\u001b[1;32m 1258\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 1259\u001b[0m \u001b[0;32mfor\u001b[0m \u001b[0mtemp_arg\u001b[0m \u001b[0;32min\u001b[0m \u001b[0mtemp_args\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
"\u001b[0;32m~/bin/spark-2.4.5-bin-hadoop2.7/python/pyspark/sql/utils.py\u001b[0m in \u001b[0;36mdeco\u001b[0;34m(*a, **kw)\u001b[0m\n\u001b[1;32m 61\u001b[0m \u001b[0;32mdef\u001b[0m \u001b[0mdeco\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m*\u001b[0m\u001b[0ma\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m**\u001b[0m\u001b[0mkw\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 62\u001b[0m \u001b[0;32mtry\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m---> 63\u001b[0;31m \u001b[0;32mreturn\u001b[0m \u001b[0mf\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m*\u001b[0m\u001b[0ma\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m**\u001b[0m\u001b[0mkw\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 64\u001b[0m \u001b[0;32mexcept\u001b[0m \u001b[0mpy4j\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mprotocol\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mPy4JJavaError\u001b[0m \u001b[0;32mas\u001b[0m \u001b[0me\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 65\u001b[0m \u001b[0ms\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0me\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mjava_exception\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mtoString\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
"\u001b[0;32m~/bin/spark-2.4.5-bin-hadoop2.7/python/lib/py4j-0.10.7-src.zip/py4j/protocol.py\u001b[0m in \u001b[0;36mget_return_value\u001b[0;34m(answer, gateway_client, target_id, name)\u001b[0m\n\u001b[1;32m 326\u001b[0m raise Py4JJavaError(\n\u001b[1;32m 327\u001b[0m \u001b[0;34m\"An error occurred while calling {0}{1}{2}.\\n\"\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 328\u001b[0;31m format(target_id, \".\", name), value)\n\u001b[0m\u001b[1;32m 329\u001b[0m \u001b[0;32melse\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 330\u001b[0m raise Py4JError(\n",
"\u001b[0;31mPy4JJavaError\u001b[0m: An error occurred while calling o397.apply.\n: java.lang.ClassCastException: io.archivesunleashed.UdfLoader$$anonfun$hasContent$1 cannot be cast to scala.Function1\n\tat org.apache.spark.sql.catalyst.expressions.ScalaUDF.<init>(ScalaUDF.scala:104)\n\tat org.apache.spark.sql.expressions.UserDefinedFunction.apply(UserDefinedFunction.scala:85)\n\tat sun.reflect.GeneratedMethodAccessor70.invoke(Unknown Source)\n\tat sun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)\n\tat java.lang.reflect.Method.invoke(Method.java:498)\n\tat py4j.reflection.MethodInvoker.invoke(MethodInvoker.java:244)\n\tat py4j.reflection.ReflectionEngine.invoke(ReflectionEngine.java:357)\n\tat py4j.Gateway.invoke(Gateway.java:282)\n\tat py4j.commands.AbstractCommand.invokeMethod(AbstractCommand.java:132)\n\tat py4j.commands.CallCommand.execute(CallCommand.java:79)\n\tat py4j.GatewayConnection.run(GatewayConnection.java:238)\n\tat java.lang.Thread.run(Thread.java:748)\n"
]
}
],
"source": [
"content = [\"radio\"]\n",
"\n",
"WebArchive(sc, sqlContext, \"/home/nruest/Projects/au/sample-data/geocities\")\\\n",
" .webpages()\\\n",
" .filter(Udf.has_content(\"content\", content))\\\n",
" .select(\"crawl_date\", Udf.extract_domain(\"url\").alias(\"domain\"), \"url\", Udf.remove_http_header(\"content\").alias(\"content\"))\\\n",
" .show(10, True)"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.7.3"
}
},
"nbformat": 4,
"nbformat_minor": 2
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment