Skip to content

Instantly share code, notes, and snippets.

Created December 1, 2014 12:02
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save anonymous/bf63359fc7d45f16effc to your computer and use it in GitHub Desktop.
Save anonymous/bf63359fc7d45f16effc to your computer and use it in GitHub Desktop.
./10/accessmylibrary.com_10089a9c136ab649995090e2f42eb15c.html.txt: /*<![CDATA[*//*---->*/
./19/greendot.com_19c67948dbbefd17e979c59b283efba4.html.txt: /* <![CDATA[ */
./1e/ighome.com_1e1c95f9d4f96a2c7890f393b6398950.html.txt://<![CDATA[
./22/walmartmoneycard.com_22a1f0425c8ce79dd43af2a9f1de624e.html.txt: /* <![CDATA[ */
./2e/agweb.com_2eaca0dcef26c1ccd24aea8be2dfaeb6.html.txt: //<![CDATA[
./2e/agweb.com_2eaca0dcef26c1ccd24aea8be2dfaeb6.html.txt://<![CDATA[
./2f/rateitall.com_2f856f560dd3d8b494cdad8ac8296f75.html.txt://<![CDATA[
./37/tajhotels.com_376cbe0e740d3c7d0e3c03443a0380bf.html.txt://<![CDATA[
./47/tileshop.com_47cba1fbd4561ebd6ed902ebc6f85521.html.txt://<![CDATA[
./47/tileshop.com_47cba1fbd4561ebd6ed902ebc6f85521.html.txt://<![CDATA[
./47/tileshop.com_47cba1fbd4561ebd6ed902ebc6f85521.html.txt: /* <![CDATA[ */
./51/mobi-book.com_5164e862bffe7b70b08f7615ccd8a563.html.txt: //<![CDATA[
./51/mobi-book.com_5164e862bffe7b70b08f7615ccd8a563.html.txt: //<![CDATA[
./51/mobi-book.com_5164e862bffe7b70b08f7615ccd8a563.html.txt: //<![CDATA[
./51/mobi-book.com_5164e862bffe7b70b08f7615ccd8a563.html.txt: //<![CDATA[
./51/mobi-book.com_5164e862bffe7b70b08f7615ccd8a563.html.txt: //<![CDATA[
./51/mobi-book.com_5164e862bffe7b70b08f7615ccd8a563.html.txt: //<![CDATA[
./51/mobi-book.com_5164e862bffe7b70b08f7615ccd8a563.html.txt: //<![CDATA[
./57/guiadinmuebles.com_57e5fc768ae98d463368d0a5566d295f.html.txt: //<![CDATA[
./74/ielts.org_74d9fe19a3149dfadbfafa997d9c2743.html.txt://<![CDATA[
./74/ielts.org_74d9fe19a3149dfadbfafa997d9c2743.html.txt: //<![CDATA[
./74/ielts.org_74d9fe19a3149dfadbfafa997d9c2743.html.txt: //<![CDATA[
./75/encyclopedia.com_75b576cf163c631e6bfc9ada9cbc26d7.html.txt: /*<![CDATA[*//*---->*/
./75/encyclopedia.com_75b576cf163c631e6bfc9ada9cbc26d7.html.txt: /*<![CDATA[*//*---->*/
./75/encyclopedia.com_75b576cf163c631e6bfc9ada9cbc26d7.html.txt: /*<![CDATA[*//*---->*/
./75/encyclopedia.com_75b576cf163c631e6bfc9ada9cbc26d7.html.txt:<script type="text/javascript">/*<![CDATA[*//*---->*/var OmnitureSuite = 'highbeamency'; /*--*//*]]>*/</script>
./75/encyclopedia.com_75b576cf163c631e6bfc9ada9cbc26d7.html.txt: /*<![CDATA[*//*---->*/
./7e/hostednumbers.com_7e5c35459fe4870318288b5c9df427fd.html.txt: <!-- //--><![CDATA[//><!--
./7e/hostednumbers.com_7e5c35459fe4870318288b5c9df427fd.html.txt: /* <![CDATA[ */
./7e/hostednumbers.com_7e5c35459fe4870318288b5c9df427fd.html.txt: /* <![CDATA[ */
./7e/hostednumbers.com_7e5c35459fe4870318288b5c9df427fd.html.txt: /* <![CDATA[ */
./7e/hostednumbers.com_7e5c35459fe4870318288b5c9df427fd.html.txt: /* <![CDATA[ */
./7e/hostednumbers.com_7e5c35459fe4870318288b5c9df427fd.html.txt: /* <![CDATA[ */
./7e/hostednumbers.com_7e5c35459fe4870318288b5c9df427fd.html.txt: /* <![CDATA[ */
./7f/bytbil.com_7fb88ea6cbaa4ce9373475de3b3bfad8.html.txt://<![CDATA[
./7f/bytbil.com_7fb88ea6cbaa4ce9373475de3b3bfad8.html.txt://<![CDATA[
./7f/bytbil.com_7fb88ea6cbaa4ce9373475de3b3bfad8.html.txt://<![CDATA[
./7f/bytbil.com_7fb88ea6cbaa4ce9373475de3b3bfad8.html.txt://<![CDATA[
./7f/bytbil.com_7fb88ea6cbaa4ce9373475de3b3bfad8.html.txt: //<![CDATA[
./7f/bytbil.com_7fb88ea6cbaa4ce9373475de3b3bfad8.html.txt://<![CDATA[
./7f/bytbil.com_7fb88ea6cbaa4ce9373475de3b3bfad8.html.txt://<![CDATA[
./7f/bytbil.com_7fb88ea6cbaa4ce9373475de3b3bfad8.html.txt://<![CDATA[
./7f/bytbil.com_7fb88ea6cbaa4ce9373475de3b3bfad8.html.txt://<![CDATA[
./8b/warcraftchina.com_8b6f3155094707822000357d98dc8d5a.html.txt: //<![CDATA[
./8b/warcraftchina.com_8b6f3155094707822000357d98dc8d5a.html.txt: //<![CDATA[
./a4/vivantabytaj.com_a4f318fbcbdcaacf642705f5aae17f42.html.txt: //<![CDATA[
./a4/vivantabytaj.com_a4f318fbcbdcaacf642705f5aae17f42.html.txt: //<![CDATA[
./a4/vivantabytaj.com_a4f318fbcbdcaacf642705f5aae17f42.html.txt:/* <![CDATA[ */
./af/kau.edu.sa_afe10435eab08898cc251cb4a91c6179.html.txt://<![CDATA[
./af/kau.edu.sa_afe10435eab08898cc251cb4a91c6179.html.txt://<![CDATA[
./bf/merrilledge.com_bf014c995945ddadc7bd3067bd063880.html.txt://<![CDATA[
./bf/merrilledge.com_bf014c995945ddadc7bd3067bd063880.html.txt: <!-- end content generated by genericTransform.xsl --><script type="text/javascript">/* <![CDATA[ */ function ctl00_MainContent_SimpleContentControl1_ctl00_ecmsWPRECMSContentCtrl1_CCM_Callback(specparams){ var paramChanged=false; for (i=0; i<specparams.length; i++){ if (specparams[i][0]=='location'){ ctl00_MainContent_SimpleContentControl1_ctl00_ecmsWPRECMSContentCtrl1_SPEC_PARAM_ARY[specparams[i][0]] = specparams[i][0]+'|'+specparams[i][1]; paramChanged=true;} } if (paramChanged==true) {var obj = $find('ctl00_MainContent_SimpleContentControl1_ctl00_ecmsWPRECMSContentCtrl1'); obj.setRuntimeXMLParameters(FormatSpecParams('zone', ctl00_MainContent_SimpleContentControl1_ctl00_ecmsWPRECMSContentCtrl1_SPEC_PARAM_ARY)); obj.LoadContent(); } } Sys.Application.add_load(ctl00_MainContent_SimpleContentControl1_ctl00_ecmsWPRECMSContentCtrl1_ControlSubscriber); function ctl00_MainContent_SimpleContentControl1_ctl00_ecmsWPRECMSContentCtrl1_ControlSubscriber() { ControlManager.subscribe('ctl00_MainContent_SimpleContentControl1_ctl00_ecmsWPRECMSContentCtrl1', ctl00_MainContent_SimpleContentControl1_ctl00_ecmsWPRECMSContentCtrl1_CCM_Callback); } var ctl00_MainContent_SimpleContentControl1_ctl00_ecmsWPRECMSContentCtrl1_SPEC_PARAM_ARY = new Array(); ctl00_MainContent_SimpleContentControl1_ctl00_ecmsWPRECMSContentCtrl1_SPEC_PARAM_ARY['location']='location_id|phZone1'; /* ]]> */</script>
./bf/merrilledge.com_bf014c995945ddadc7bd3067bd063880.html.txt:<script type="text/javascript">/* <![CDATA[ */ function ctl00_MainContent_SimpleContentControl2_ctl00_ecmsWPRECMSContentCtrl1_CCM_Callback(specparams){ var paramChanged=false; for (i=0; i<specparams.length; i++){ if (specparams[i][0]=='location'){ ctl00_MainContent_SimpleContentControl2_ctl00_ecmsWPRECMSContentCtrl1_SPEC_PARAM_ARY[specparams[i][0]] = specparams[i][0]+'|'+specparams[i][1]; paramChanged=true;} } if (paramChanged==true) {var obj = $find('ctl00_MainContent_SimpleContentControl2_ctl00_ecmsWPRECMSContentCtrl1'); obj.setRuntimeXMLParameters(FormatSpecParams('zone', ctl00_MainContent_SimpleContentControl2_ctl00_ecmsWPRECMSContentCtrl1_SPEC_PARAM_ARY)); obj.LoadContent(); } } Sys.Application.add_load(ctl00_MainContent_SimpleContentControl2_ctl00_ecmsWPRECMSContentCtrl1_ControlSubscriber); function ctl00_MainContent_SimpleContentControl2_ctl00_ecmsWPRECMSContentCtrl1_ControlSubscriber() { ControlManager.subscribe('ctl00_MainContent_SimpleContentControl2_ctl00_ecmsWPRECMSContentCtrl1', ctl00_MainContent_SimpleContentControl2_ctl00_ecmsWPRECMSContentCtrl1_CCM_Callback); } var ctl00_MainContent_SimpleContentControl2_ctl00_ecmsWPRECMSContentCtrl1_SPEC_PARAM_ARY = new Array(); ctl00_MainContent_SimpleContentControl2_ctl00_ecmsWPRECMSContentCtrl1_SPEC_PARAM_ARY['location']='location_id|phZone2'; /* ]]> */</script>
./bf/merrilledge.com_bf014c995945ddadc7bd3067bd063880.html.txt: <!-- end content generated by genericTransform.xsl --><script type="text/javascript">/* <![CDATA[ */ function ctl00_MainContent_SimpleContentControl3_ctl00_ecmsWPRECMSContentCtrl1_CCM_Callback(specparams){ var paramChanged=false; for (i=0; i<specparams.length; i++){ if (specparams[i][0]=='location'){ ctl00_MainContent_SimpleContentControl3_ctl00_ecmsWPRECMSContentCtrl1_SPEC_PARAM_ARY[specparams[i][0]] = specparams[i][0]+'|'+specparams[i][1]; paramChanged=true;} } if (paramChanged==true) {var obj = $find('ctl00_MainContent_SimpleContentControl3_ctl00_ecmsWPRECMSContentCtrl1'); obj.setRuntimeXMLParameters(FormatSpecParams('zone', ctl00_MainContent_SimpleContentControl3_ctl00_ecmsWPRECMSContentCtrl1_SPEC_PARAM_ARY)); obj.LoadContent(); } } Sys.Application.add_load(ctl00_MainContent_SimpleContentControl3_ctl00_ecmsWPRECMSContentCtrl1_ControlSubscriber); function ctl00_MainContent_SimpleContentControl3_ctl00_ecmsWPRECMSContentCtrl1_ControlSubscriber() { ControlManager.subscribe('ctl00_MainContent_SimpleContentControl3_ctl00_ecmsWPRECMSContentCtrl1', ctl00_MainContent_SimpleContentControl3_ctl00_ecmsWPRECMSContentCtrl1_CCM_Callback); } var ctl00_MainContent_SimpleContentControl3_ctl00_ecmsWPRECMSContentCtrl1_SPEC_PARAM_ARY = new Array(); ctl00_MainContent_SimpleContentControl3_ctl00_ecmsWPRECMSContentCtrl1_SPEC_PARAM_ARY['location']='location_id|phZone3'; /* ]]> */</script>
./bf/merrilledge.com_bf014c995945ddadc7bd3067bd063880.html.txt:<script type="text/javascript">/* <![CDATA[ */ function ctl00_MainContent_SimpleContentControl4_ctl00_ecmsWPRECMSContentCtrl1_CCM_Callback(specparams){ var paramChanged=false; for (i=0; i<specparams.length; i++){ if (specparams[i][0]=='location'){ ctl00_MainContent_SimpleContentControl4_ctl00_ecmsWPRECMSContentCtrl1_SPEC_PARAM_ARY[specparams[i][0]] = specparams[i][0]+'|'+specparams[i][1]; paramChanged=true;} } if (paramChanged==true) {var obj = $find('ctl00_MainContent_SimpleContentControl4_ctl00_ecmsWPRECMSContentCtrl1'); obj.setRuntimeXMLParameters(FormatSpecParams('zone', ctl00_MainContent_SimpleContentControl4_ctl00_ecmsWPRECMSContentCtrl1_SPEC_PARAM_ARY)); obj.LoadContent(); } } Sys.Application.add_load(ctl00_MainContent_SimpleContentControl4_ctl00_ecmsWPRECMSContentCtrl1_ControlSubscriber); function ctl00_MainContent_SimpleContentControl4_ctl00_ecmsWPRECMSContentCtrl1_ControlSubscriber() { ControlManager.subscribe('ctl00_MainContent_SimpleContentControl4_ctl00_ecmsWPRECMSContentCtrl1', ctl00_MainContent_SimpleContentControl4_ctl00_ecmsWPRECMSContentCtrl1_CCM_Callback); } var ctl00_MainContent_SimpleContentControl4_ctl00_ecmsWPRECMSContentCtrl1_SPEC_PARAM_ARY = new Array(); ctl00_MainContent_SimpleContentControl4_ctl00_ecmsWPRECMSContentCtrl1_SPEC_PARAM_ARY['location']='location_id|phZone4'; /* ]]> */</script>
./bf/merrilledge.com_bf014c995945ddadc7bd3067bd063880.html.txt: <!-- end content generated by genericTransform.xsl --><script type="text/javascript">/* <![CDATA[ */ function ctl00_MainContent_SimpleContentControl5_ctl00_ecmsWPRECMSContentCtrl1_CCM_Callback(specparams){ var paramChanged=false; for (i=0; i<specparams.length; i++){ if (specparams[i][0]=='location'){ ctl00_MainContent_SimpleContentControl5_ctl00_ecmsWPRECMSContentCtrl1_SPEC_PARAM_ARY[specparams[i][0]] = specparams[i][0]+'|'+specparams[i][1]; paramChanged=true;} } if (paramChanged==true) {var obj = $find('ctl00_MainContent_SimpleContentControl5_ctl00_ecmsWPRECMSContentCtrl1'); obj.setRuntimeXMLParameters(FormatSpecParams('zone', ctl00_MainContent_SimpleContentControl5_ctl00_ecmsWPRECMSContentCtrl1_SPEC_PARAM_ARY)); obj.LoadContent(); } } Sys.Application.add_load(ctl00_MainContent_SimpleContentControl5_ctl00_ecmsWPRECMSContentCtrl1_ControlSubscriber); function ctl00_MainContent_SimpleContentControl5_ctl00_ecmsWPRECMSContentCtrl1_ControlSubscriber() { ControlManager.subscribe('ctl00_MainContent_SimpleContentControl5_ctl00_ecmsWPRECMSContentCtrl1', ctl00_MainContent_SimpleContentControl5_ctl00_ecmsWPRECMSContentCtrl1_CCM_Callback); } var ctl00_MainContent_SimpleContentControl5_ctl00_ecmsWPRECMSContentCtrl1_SPEC_PARAM_ARY = new Array(); ctl00_MainContent_SimpleContentControl5_ctl00_ecmsWPRECMSContentCtrl1_SPEC_PARAM_ARY['location']='location_id|phZone5'; /* ]]> */</script>
./bf/merrilledge.com_bf014c995945ddadc7bd3067bd063880.html.txt://<![CDATA[
./bf/merrilledge.com_bf014c995945ddadc7bd3067bd063880.html.txt://<![CDATA[
./d1/susquehanna.net_d143c161241034676102e8463c970e1f.html.txt://<![CDATA[
./d1/susquehanna.net_d143c161241034676102e8463c970e1f.html.txt: //<![CDATA[
./d1/susquehanna.net_d143c161241034676102e8463c970e1f.html.txt: //<![CDATA[
./d1/susquehanna.net_d143c161241034676102e8463c970e1f.html.txt: //<![CDATA[
./d1/susquehanna.net_d143c161241034676102e8463c970e1f.html.txt://<![CDATA[
./e7/ecp.gov.pk_e7ef500ab20ffb7479edb18ed0ce81d8.html.txt://<![CDATA[
./f0/standardbank.com_f0a2c331f94a37ddfc1daf829798a932.html.txt://<![CDATA[
./f3/thonky.com_f308cc4e2b05f504d3e8cb019ac6a551.html.txt://<![CDATA[
@zcorpan
Copy link

zcorpan commented Dec 1, 2014

Data set: the http://webdevdata.org data set from 2013-09-01 (102,000 pages).

Method:

$ find . -type f -print0 | xargs -0 -P 4 -n 40 grep -E "<\!\[CDATA\[" >> ../cdata.txt
$ find . -name "*.html.txt.hdr.txt" -type f -print0 | xargs -0 -P 4 -n 40 grep -Ei "content-type\s*:\s*application/xhtml\+xml" >> ../xhtml.txt
$ cd ..
# manually search-and-replace the contents of xhtml.txt into the domain alternation used in the next command:
$ grep -E "(secure-booker\.com|accessmylibrary\.com|hindawi\.com|bancociudad\.com\.ar|greendot\.com|nsu\.ru|ighome\.com|walmartmoneycard\.com|agweb\.com|rateitall\.com|bayikanali\.com|onetokyo\.org|tajhotels\.com|tileshop\.com|mobi-book\.com|guiadinmuebles\.com|pfchangs\.com|ielts\.org|encyclopedia\.com|hostednumbers\.com|bytbil\.com|merrell\.com|warcraftchina\.com|peiwei\.com|alapetite\.fr|vivantabytaj\.com|appointy\.com|kau\.edu\.sa|myjetbrains\.com|merrilledge\.com|mcldaz\.org|susquehanna\.net|marcustheatres\.com|hostknox\.com|ecp\.gov\.pk|standardbank\.com|thonky\.com|regions\.com)" cdata.txt > cdata+xhtml.txt

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment