Skip to content

Instantly share code, notes, and snippets.

@nyilmaz
Created May 21, 2014 10:09
Show Gist options
  • Save nyilmaz/7cb2c5238a0ad4da7adc to your computer and use it in GitHub Desktop.
Save nyilmaz/7cb2c5238a0ad4da7adc to your computer and use it in GitHub Desktop.
package match.mycron.web.job.crawler;
import com.google.common.collect.Maps;
import org.apache.http.client.methods.CloseableHttpResponse;
import org.apache.http.client.methods.HttpUriRequest;
import org.apache.http.client.methods.RequestBuilder;
import org.apache.http.impl.client.CloseableHttpClient;
import org.apache.http.impl.client.HttpClients;
import org.apache.http.impl.conn.PoolingHttpClientConnectionManager;
import org.apache.http.util.EntityUtils;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;
import org.springframework.util.StringUtils;
import java.io.IOException;
import java.util.Map;
import java.util.concurrent.ExecutorService;
import java.util.concurrent.Executors;
import java.util.concurrent.TimeUnit;
/**
 * Command-line crawler for doktorsitesi.com.
 *
 * <p>Walks the paginated {@code /tumuzmanlar} listing, follows every link matched by
 * {@code .wrapper ul li a} on a pool of worker threads, and for each linked page that exposes a
 * phone number appends one line of the form {@code branch - name - phone - address} to a shared
 * buffer. The collected lines are printed to standard output once all workers have finished.
 *
 * @author nyilmaz
 */
public class doktorsitesicrawler {

    /** Scheme and host that relative profile links are resolved against. */
    private static final String BASE_URL = "http://www.doktorsitesi.com";

    /** Listing endpoint; the page number is passed as the {@code sayfa} query parameter. */
    private static final String LISTING_URL = BASE_URL + "/tumuzmanlar";

    /** First listing page to request. */
    private static final int FIRST_PAGE = 1;

    /** Last listing page to request (inclusive). Hard-coded; presumably the page count when written. */
    private static final int LAST_PAGE = 17;

    /** Number of worker threads fetching profile pages concurrently. */
    private static final int WORKER_THREADS = 10;

    /** Upper bound for pooled connections, both in total and per route. */
    private static final int MAX_CONNECTIONS = 1000;

    /**
     * Crawls all listing pages, schedules one task per profile link and prints the result.
     *
     * @param args ignored
     * @throws IOException if a listing page cannot be fetched or the HTTP client fails to close
     * @throws InterruptedException if interrupted while waiting for the workers to finish
     */
    public static void main(String[] args) throws IOException, InterruptedException {
        ExecutorService executorService = Executors.newFixedThreadPool(WORKER_THREADS);
        PoolingHttpClientConnectionManager connectionManager = new PoolingHttpClientConnectionManager();
        connectionManager.setDefaultMaxPerRoute(MAX_CONNECTIONS);
        connectionManager.setMaxTotal(MAX_CONNECTIONS);
        final CloseableHttpClient client = HttpClients.custom().setConnectionManager(connectionManager).build();
        // Passed through to every task but neither read nor written anywhere in this class.
        final Map<String, String> doktormap = Maps.newConcurrentMap();
        StringBuffer stringBuffer = new StringBuffer();
        try {
            try {
                for (int page = FIRST_PAGE; page <= LAST_PAGE; page++) {
                    HttpUriRequest request = RequestBuilder
                            .get()
                            .setUri(LISTING_URL)
                            .addParameter("sayfa", Integer.toString(page))
                            .build();
                    Document doc = Jsoup.parse(fetch(client, request, "ISO-8859-9"));
                    Elements elems = doc.select(".wrapper ul li a");
                    for (Element elem : elems) {
                        executorService.submit(new mythread(client, doktormap, elem, stringBuffer));
                    }
                }
            } finally {
                // Always stop the pool, even when a listing request failed: its threads are
                // non-daemon and would otherwise keep the JVM alive. Already submitted tasks
                // are allowed to finish before the shared client is closed below.
                executorService.shutdown();
                executorService.awaitTermination(Long.MAX_VALUE, TimeUnit.NANOSECONDS);
            }
        } finally {
            try {
                client.close();
            } finally {
                connectionManager.close();
            }
        }
        System.out.println(stringBuffer.toString());
    }

    /**
     * Executes {@code request} and returns the response body decoded with {@code charset}.
     * The response is closed in all cases so the pooled connection is always released.
     *
     * @param client  HTTP client used to execute the request
     * @param request request to execute
     * @param charset charset name passed to {@code EntityUtils.toString}
     * @return the response body as text
     * @throws IOException if executing the request or reading the body fails
     */
    private static String fetch(CloseableHttpClient client, HttpUriRequest request, String charset)
            throws IOException {
        CloseableHttpResponse response = client.execute(request);
        try {
            return EntityUtils.toString(response.getEntity(), charset);
        } finally {
            response.close();
        }
    }

    /**
     * Task that downloads one profile page and appends its details to the shared buffer.
     *
     * <p>Although this class extends {@link Thread}, {@code main} never starts it as a thread;
     * instances are handed to the executor, which only invokes {@link #run()}.
     */
    static class mythread extends Thread {
        CloseableHttpClient client;
        Map<String, String> doktormap;
        Element elem;
        StringBuffer stringBuffer;

        /**
         * @param client       shared HTTP client used to fetch the profile page
         * @param doktormap    shared map; stored but not used by {@link #run()}
         * @param elem         listing anchor whose {@code href} points at the profile page
         * @param stringBuffer shared buffer that result lines are appended to
         */
        mythread(CloseableHttpClient client, Map<String, String> doktormap, Element elem, StringBuffer stringBuffer) {
            this.client = client;
            this.doktormap = doktormap;
            this.elem = elem;
            this.stringBuffer = stringBuffer;
        }

        /**
         * Fetches the profile page and, if it contains a non-blank {@code [itemprop=telephone]}
         * value, appends {@code branch - name - phone - address} plus a newline to the buffer.
         * Failures are reported on standard error and do not stop other tasks.
         */
        @Override
        public void run() {
            String url = BASE_URL + elem.attr("href");
            try {
                HttpUriRequest doktorreq = RequestBuilder.get().setUri(url).build();
                Document doktorsayfa = Jsoup.parse(fetch(client, doktorreq, "UTF-8"));
                String phone = doktorsayfa.select("[itemprop=telephone]").html();
                if (!StringUtils.hasText(phone)) {
                    return;
                }
                String address = doktorsayfa.select(".locality").html();
                String branch = doktorsayfa.select(".title").eq(0).html();
                String name = elem.html();
                String line = convertUTF8(branch)
                        + " - "
                        + convertUTF8(name)
                        + " - "
                        + phone
                        + " - "
                        + convertUTF8(address)
                        + "\n";
                // A single StringBuffer.append call is synchronized, so the whole line is added
                // atomically and lines from different workers cannot interleave.
                stringBuffer.append(line);
            } catch (IOException e) {
                System.err.println("Failed to fetch " + url);
                e.printStackTrace();
            } catch (RuntimeException e) {
                // The Future returned by submit() is discarded in main, so an unchecked
                // exception would otherwise disappear without a trace.
                System.err.println("Failed to process " + url);
                e.printStackTrace();
            }
        }
    }

    /**
     * Replaces a fixed set of HTML character entities (upper/lower case u-umlaut, o-umlaut and
     * c-cedilla) with the corresponding literal characters. All other text is left unchanged.
     *
     * @param str text that may contain the entities listed above
     * @return {@code str} with those entities replaced
     */
    private static String convertUTF8(String str) {
        return str
                .replace("&Uuml;", "Ü")
                .replace("&uuml;", "ü")
                .replace("&Ouml;", "Ö")
                .replace("&ccedil;", "ç")
                .replace("&Ccedil;", "Ç")
                .replace("&ouml;", "ö");
    }
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment