Created
May 17, 2015 16:16
-
-
Save loopiezlol/d8e2923d2ceb6a314858 to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import java.io.IOException;
import java.util.Arrays;
import java.util.HashSet;
import java.util.Set;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.nodes.Node;
import org.jsoup.select.Elements;
public class Main { | |
public static void main(String[] args) throws IOException { | |
Document doc3 = Jsoup.connect("http://www.sfatulmedicului.ro/medicamente").get(); | |
Element letters = doc3.select("div.container-insiruire").first(); | |
Elements letters_links = letters.select("li a"); | |
for(Element element: letters_links){ | |
String s= element.attr("abs:href"); | |
if(!(s.equals("http://www.sfatulmedicului.ro/medicamente_j") || | |
s.equals("http://www.sfatulmedicului.ro/medicamente_w") || | |
s.equals("http://www.sfatulmedicului.ro/medicamente_y") || | |
s.equals("http://www.sfatulmedicului.ro/medicamente_q"))) | |
{ | |
int nr = getLastPage(s); | |
for (int i =1 ; i<=nr;i++){ | |
String page = s+ "_pagina_" + String.valueOf(i); | |
Document doc1 = Jsoup.connect(page).get(); | |
Element list_container = doc1.select("div.listare_index").first(); | |
list_container.select("div.paginatie").remove(); | |
Elements links = list_container.select("li a"); | |
for(Element element1 : links){ | |
getInfo(element1.attr("abs:href")); | |
} | |
} | |
} | |
else { | |
String page = s; | |
Document doc1 = Jsoup.connect(page).get(); | |
Element list_container = doc1.select("div.listare_index").first(); | |
list_container.select("div.paginatie").remove(); | |
Elements links = list_container.select("li a"); | |
for(Element element1 : links){ | |
getInfo(element1.attr("abs:href")); | |
} | |
} | |
} | |
} | |
private static int getLastPage(String page) throws IOException { | |
Document doc2 = Jsoup.connect(page).get(); | |
Element pags = doc2.select("div.listare_index").first().select("div.paginatie").select("a").last(); | |
String mylink = pags.attr("abs:href"); | |
String number = new StringBuilder(mylink).reverse().toString(); | |
number = number.substring(0, number.indexOf("_")); | |
number = new StringBuilder(number).reverse().toString(); | |
return Integer.parseInt(number); | |
} | |
private static void getInfo(String last_link) throws IOException { | |
Document doc = Jsoup.connect(last_link).get(); | |
Element body = doc.body(); | |
Element div1 = body.select("div#wrapper.wrapper").first(); | |
Element div2 = div1.select("div#container.container").first(); | |
Element content = div2.select("div.content").first(); | |
Elements articol = content.select("div.col-alpha"); | |
Element articol2 =articol.select("div.content-articol").first(); | |
articol2.select("div.rating_medicament").first().remove(); | |
articol2.select("a").remove(); | |
removeComments(articol2); | |
String titlu = doc.title(); | |
//TITLU E NUMELE MEDICAMENTULUI | |
System.out.println(titlu); | |
String prospect = articol2.text(); | |
//PROSPECT E STRINGUL HTML FORMATTED | |
System.out.println(prospect); | |
} | |
private static void removeComments(Node node) { | |
for (int i = 0; i < node.childNodes().size();) { | |
Node child = node.childNode(i); | |
if (child.nodeName().equals("#comment")) child.remove(); | |
else { | |
removeComments(child); | |
i++; | |
} | |
} | |
} | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment