pedrokoblitz / conjunto.php
Created September 19, 2012 21:46
PHP equivalent of Python's set(list)
// Serialize each element so nested arrays can be compared as strings,
// drop the duplicates with array_unique, then restore the originals.
function conjunto($dados)
{
    return array_map("unserialize", array_unique(array_map("serialize", $dados)));
}
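For comparison, a minimal Python sketch of the behaviour being mimicked: set() needs hashable items, so a list of dicts takes the same serialize/dedupe/restore detour (json stands in for PHP's serialize here):

import json

dados = [{"a": 1}, {"a": 1}, {"b": 2}]
# set(dados) would raise TypeError because dicts are unhashable,
# so serialize to strings first, dedupe, then restore.
unicos = [json.loads(s) for s in set(json.dumps(d, sort_keys=True) for d in dados)]
print(unicos)  # two unique dicts; order is not guaranteed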
pedrokoblitz / distancia_coordenadas.sql
Created September 20, 2012 18:47
Calculate distance in MySQL with latitude and longitude (spherical law of cosines; result in statute miles)
SELECT ((ACOS(SIN($lat * PI() / 180) * SIN(lat * PI() / 180) + COS($lat * PI() / 180) * COS(lat * PI() / 180) * COS(($lon - lon) * PI() / 180)) * 180 / PI()) * 60 * 1.1515) AS `distance` FROM `members` HAVING `distance` <= '10' ORDER BY `distance` ASC
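For reference, a transcription of the same spherical-law-of-cosines formula into Python (the factor 60 turns degrees into arc-minutes, i.e. nautical miles; 1.1515 turns those into statute miles):

import math

def distance_miles(lat1, lon1, lat2, lon2):
    # Same formula as the query above; inputs in decimal degrees.
    rad = math.pi / 180
    arc = math.acos(
        math.sin(lat1 * rad) * math.sin(lat2 * rad)
        + math.cos(lat1 * rad) * math.cos(lat2 * rad)
        * math.cos((lon1 - lon2) * rad)
    )
    return arc * 180 / math.pi * 60 * 1.1515

print(distance_miles(40.7128, -74.0060, 34.0522, -118.2437))  # roughly the NYC-LA distance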
<?php
/* Merge multiple RSS feeds with SimplePie
*
* Just set the path to SimplePie and
* fill the $feeds array with the feeds you want
*
* You should probably also change the channel title, link, and description,
* plus I added a CC license you may not want
*
* Help from: http://www.webmaster-source.com/2007/08/06/merging-rss-feeds-with-simplepie/
*/
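The gist itself is PHP/SimplePie; as a cross-check of the idea, here is a minimal merge in Python with feedparser (the feed URLs are placeholders, not from the gist):

import time
import feedparser

FEEDS = [  # placeholders; fill in like the $feeds array above
    "https://example.com/a.rss",
    "https://example.com/b.rss",
]

entries = []
for url in FEEDS:
    entries.extend(feedparser.parse(url).entries)

# Newest first, which is what a merged channel normally wants.
entries.sort(
    key=lambda e: time.mktime(e.published_parsed) if e.get("published_parsed") else 0.0,
    reverse=True,
)

for e in entries[:20]:
    print(e.get("published", "?"), "-", e.get("title", "untitled"))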
pedrokoblitz / notas_nlp.txt
Last active December 22, 2015 05:19
data-structure details for language processing
status
tokenstore -> final tests
processors -> early stages
redo the spider's regexp extractor
gather code for the queues and the daemon
the daemon starts the spiders and workers (see the sketch after these notes)
sysctl manages the workers
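The notes describe a daemon feeding spiders and workers through queues; a minimal sketch of that layout with Python's multiprocessing (every name here is an assumption, not code from the gist):

import multiprocessing as mp

def worker(queue):
    # Consume documents until the daemon sends the stop marker.
    while True:
        doc = queue.get()
        if doc is None:
            break
        print("processing", doc)

def daemon():
    queue = mp.Queue()
    workers = [mp.Process(target=worker, args=(queue,)) for _ in range(2)]
    for w in workers:
        w.start()
    # A spider would normally fill the queue; stub documents stand in here.
    for doc in ["doc1", "doc2", "doc3"]:
        queue.put(doc)
    for _ in workers:
        queue.put(None)  # one stop marker per worker
    for w in workers:
        w.join()

if __name__ == "__main__":
    daemon()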
/*
general administration and configuration of all the tool's utilities;
used primarily to define the data-collection instructions and to split the
collected documents into sets and subsets, which serve as the basis for a
starting point for each type of processing
*/
# a project is the largest possible unit; it states the goal of the document collection and processing
CREATE TABLE projects (
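The preview cuts the table definition off here. Purely as an illustration of a "project as the largest unit" record, a Python sqlite3 sketch with hypothetical columns (none of these names come from the gist):

import sqlite3

conn = sqlite3.connect(":memory:")
# Hypothetical schema; the original gist's columns are truncated above.
conn.execute("""
    CREATE TABLE projects (
        id INTEGER PRIMARY KEY,
        name TEXT NOT NULL,          -- goal of the collection
        description TEXT,
        created_at TEXT
    )
""")
conn.execute(
    "INSERT INTO projects (name, description, created_at) VALUES (?, ?, ?)",
    ("example", "crawl and classify news articles", "2012-09-20"),
)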
<?php
//*************************English Description***************************//
// Class to convert Latitude/Longitude Coordinates //
// Developed by: Diêgo Garrido de Almeida (diego@brflog.net) //
// Location: Conselheiro Lafaiete - Minas Gerais / Brazil //
// License: None, this class can be used without credits //
// Recommended use: To convert the Google Earth standard coordinates //
// to Google Maps API standard coordinates, to do this, //
// use the method GeoConversao::DMS2Dd. //
// eg: $GeoConversao->DMS2Dd('45º22\'38"') -> 45.3772 //
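The class body is not shown in this preview; the conversion itself is short, so here is a re-implementation of the idea in Python (not the class's actual code; sign handling for southern/western coordinates is omitted):

import re

def dms_to_decimal(dms):
    # Parse a degrees/minutes/seconds string such as 45º22'38".
    deg, minutes, seconds = (float(x) for x in re.findall(r"[\d.]+", dms))
    return deg + minutes / 60 + seconds / 3600

print(round(dms_to_decimal("45º22'38\""), 4))  # 45.3772, matching the example above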
import nltk
from nltk.corpus import mac_morpho

class Classificador(object):
    def __init__(self):
        # Train a cascaded bigram -> unigram -> default tagger on mac_morpho.
        tsents = mac_morpho.tagged_sents()
        tsents = [[(w.lower(), t) for (w, t) in sent] for sent in tsents if sent]
        tagger0 = nltk.DefaultTagger('N')
        tagger1 = nltk.UnigramTagger(tsents[100:], backoff=tagger0)
        self.tagger = nltk.BigramTagger(tsents[100:], backoff=tagger1)

    # tags the words of the text
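A hedged usage sketch: the tagging method itself is cut off above, so this calls the trained tagger directly (requires nltk plus a one-time download of the mac_morpho corpus):

# nltk.download('mac_morpho')  # one-time corpus download
c = Classificador()
print(c.tagger.tag(["o", "menino", "jogou", "a", "bola"]))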
# recursively replace text in all files with a certain file ending
find . -type f -iname '*.html' -exec sed -i 's,href="../css/stylesheet.css",href="../../css/stylesheet.css",g' {} +
# download Springer Link Books via University Proxy and add the ".pdf" file ending
export http_proxy="http://proxy.zfn.uni-bremen.de:3128";
wget -r -l 1 --reject html,js,css,jpg,png --proxy-user STUD_IP_USERNAME --proxy-passwd STUD_IP_PASSWORD LINK_TO_BOOK;
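The description mentions adding the ".pdf" file ending, but that step is not in the snippet. One way to do it afterwards, sketched in Python (the wget output directory name is an assumption):

import os

# Walk the wget output tree and append ".pdf" to extensionless files.
for root, _, files in os.walk("link.springer.com"):  # assumed directory name
    for name in files:
        if "." not in name:
            path = os.path.join(root, name)
            os.rename(path, path + ".pdf")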
import Cheetah.Filters

class UnicodeHarder(Cheetah.Filters.Filter):
    def filter(self, val,
               encoding='utf8',
               str=str,
               **kw):
        """ Try our best to unicode our strings """
        if not val:
            return u''
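        # The preview cuts off here. A plausible Python 2 continuation, an
        # assumption rather than the gist's actual code: decode byte strings,
        # and coerce everything else with unicode().
        if isinstance(val, str):
            return val.decode(encoding, 'replace')
        return unicode(val)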
# Many times when crawling we run into problems where content rendered on the page is generated with JavaScript (e.g. AJAX requests, jQuery craziness), so Scrapy is unable to crawl it. However, if you use Scrapy along with the web-testing framework Selenium, you can crawl anything displayed in a normal web browser.
#
# Some things to note:
# You must have the Python version of Selenium RC installed for this to work, and you must have set up Selenium properly. Also, this is just a template crawler: you could get much crazier and more advanced, but I just wanted to show the basic idea.
# As the code stands you will be doing two requests for any given URL: one made by Scrapy and one made by Selenium. There are probably ways to have Selenium make the one and only request, but I did not bother to implement that, and by doing two requests you also get to crawl the page with Scrapy.
#
# This is quite powerful.
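# A minimal sketch of the pattern just described, using the modern Selenium
# WebDriver API rather than Selenium RC; the start URL and the CSS selector
# are placeholders, not from the original post.

import scrapy
from selenium import webdriver

class JsSpider(scrapy.Spider):
    name = "js_spider"
    start_urls = ["https://example.com"]  # placeholder

    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)
        self.driver = webdriver.Firefox()

    def parse(self, response):
        # Second request for the same URL: let a real browser render the JavaScript.
        self.driver.get(response.url)
        rendered = scrapy.Selector(text=self.driver.page_source)
        for title in rendered.css("h1::text").getall():
            yield {"title": title}

    def closed(self, reason):
        self.driver.quit()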