Skip to content

Instantly share code, notes, and snippets.

@iamandrewluca
Last active October 8, 2023 13:04
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save iamandrewluca/ae1cc8dcca2259b0c18d731b10d3c958 to your computer and use it in GitHub Desktop.
Save iamandrewluca/ae1cc8dcca2259b0c18d731b10d3c958 to your computer and use it in GitHub Desktop.
import org.apache.commons.csv.CSVFormat;
import org.apache.commons.csv.CSVPrinter;
import org.apache.commons.csv.CSVRecord;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;
import java.io.*;
import java.net.*;
import java.util.ArrayList;
import java.util.List;
/**
 * Command-line scraper that walks the paginated company search on yp.md,
 * opens every company page it finds and writes one CSV row per company.
 *
 * <p>Usage: {@code java Main <output-file>}. The output file is written as
 * UTF-8 and starts with the header row defined by {@link #FILE_HEADER}.
 */
public class Main {
    /** Base URL that relative company links are appended to. */
    static String ypMain = "http://yp.md";
    /** Search-result URL prefix; the page index is appended to it. */
    static String ypLink = ypMain + "/rom/search/companies?page=";

    private static final String NEW_LINE_SEPARATOR = "\n";
    private static final Object[] FILE_HEADER = {"name", "site", "email", "phones", "categories", "logo", "address"};

    /** Encoding of the generated CSV file. */
    private static final String OUTPUT_CHARSET = "UTF-8";
    /** How many consecutive times one result page is requested before the scrape gives up. */
    private static final int MAX_PAGE_ATTEMPTS = 5;
    /** Pause between two attempts to load the same result page, in milliseconds. */
    private static final long PAGE_RETRY_DELAY_MS = 2000L;
    /** Pause after each company page has been processed, in milliseconds. */
    private static final long REQUEST_DELAY_MS = 500L;

    private static final String ADDRESS_FIELD = ".field-name-field-address";

    /**
     * Entry point. Expects exactly one argument: the path of the CSV file to create.
     *
     * @param args command-line arguments; {@code args[0]} is the output file name
     */
    public static void main(String[] args) {
        if (args.length != 1) {
            System.out.println("need filename parameter");
            return;
        }

        Writer writer = null;
        CSVPrinter csvPrinter = null;
        CSVFormat csvFormat = CSVFormat.DEFAULT.withRecordSeparator(NEW_LINE_SEPARATOR);
        try {
            // Explicit charset: FileWriter would use the platform default encoding.
            writer = new OutputStreamWriter(new FileOutputStream(args[0]), OUTPUT_CHARSET);
            csvPrinter = new CSVPrinter(writer, csvFormat);
            csvPrinter.printRecord(FILE_HEADER);
            scrapeAllPages(csvPrinter);
        } catch (IOException e) {
            e.printStackTrace();
        } finally {
            // Either resource may still be null if its constructor threw.
            try {
                if (csvPrinter != null) {
                    // Closing the printer also closes the writer it wraps.
                    csvPrinter.close();
                } else if (writer != null) {
                    writer.close();
                }
            } catch (IOException e) {
                e.printStackTrace();
            }
        }
    }

    /**
     * Requests result pages starting at index 0 until a page contains no
     * companies, a page keeps failing to load, or the thread is interrupted.
     *
     * @param printer destination for the company rows
     * @throws IOException if writing a row fails
     */
    private static void scrapeAllPages(CSVPrinter printer) throws IOException {
        int page = 0;
        int failedAttempts = 0;
        while (true) {
            Document doc;
            try {
                doc = Jsoup.connect(ypLink + page).get();
                System.out.println("Page " + page + " loaded");
                failedAttempts = 0;
            } catch (Exception e) {
                // Boundary catch: any failure to fetch the page is treated as retryable,
                // but only a bounded number of times so a dead page cannot loop forever.
                failedAttempts++;
                if (failedAttempts >= MAX_PAGE_ATTEMPTS) {
                    System.out.println("Page " + page + " could not load after "
                            + failedAttempts + " attempts. Giving up");
                    return;
                }
                System.out.println("Page " + page + " could not load. Retrying");
                if (!pause(PAGE_RETRY_DELAY_MS)) {
                    return;
                }
                continue;
            }

            Elements companies = doc.select(".pane-content .view-content .views-row");
            if (companies.isEmpty()) {
                // A page without companies marks the end of the listing.
                return;
            }
            if (!scrapeCompanies(companies, page, printer)) {
                return;
            }
            page++;
        }
    }

    /**
     * Loads every company linked from one result page and writes its row.
     * Companies that cannot be loaded or have an unexpected layout are skipped.
     *
     * @param companies result rows of the current page
     * @param page      index of the current page, used for progress output
     * @param printer   destination for the company rows
     * @return {@code false} if the thread was interrupted and scraping should stop
     * @throws IOException if writing a row fails
     */
    private static boolean scrapeCompanies(Elements companies, int page, CSVPrinter printer) throws IOException {
        for (Element row : companies) {
            String companyLink = row.select(".content .right .node-title a").attr("href");
            Document companyDoc = loadCompany(companyLink);
            if (companyDoc == null) {
                continue;
            }

            List<String> companyData = extractCompany(companyDoc);
            if (companyData == null) {
                System.out.println("Company " + companyLink + " has an unexpected layout. Skipping");
            } else {
                System.out.println(page + ": " + companyData.get(0));
                printer.printRecord(companyData);
            }

            if (!pause(REQUEST_DELAY_MS)) {
                return false;
            }
        }
        return true;
    }

    /**
     * Fetches a company page, trying twice before giving up.
     *
     * @param companyLink link taken from the result row, appended to {@link #ypMain}
     * @return the parsed page, or {@code null} if both attempts failed
     */
    private static Document loadCompany(String companyLink) {
        String url = ypMain + companyLink;
        try {
            return Jsoup.connect(url).get();
        } catch (Exception e) {
            System.out.println("Company " + companyLink + " could not load first attemp");
        }
        try {
            return Jsoup.connect(url).get();
        } catch (Exception e) {
            System.out.println("Company " + companyLink + " could not load second attemp");
            return null;
        }
    }

    /**
     * Builds the CSV row for one company page, in {@link #FILE_HEADER} order.
     * Fields that are absent from the page become empty strings.
     *
     * @param companyDoc parsed company page
     * @return the row values, or {@code null} if the page has fewer than two
     *         {@code .col-company} columns and therefore cannot be read
     */
    private static List<String> extractCompany(Document companyDoc) {
        Elements columns = companyDoc.select(".company-content .col-company");
        if (columns.size() < 2) {
            return null;
        }
        // The details are read from the second .col-company column.
        Element content = columns.get(1);

        String logo = firstAttr(companyDoc, ".company-content .col-company-logo img", "src");
        String name = firstText(content, "h2");
        String site = firstAttr(content, ".field-name-field-company-site a", "href");
        String email = firstAttr(content, ".field-name-field-email a", "href").replace("mailto:", "");

        StringBuilder categories = new StringBuilder();
        appendTexts(categories, content.select(".field-row-tags ul li a"));

        // Landline numbers come first, then mobile numbers.
        StringBuilder phones = new StringBuilder();
        appendTexts(phones, content.select(".field-name-field-landline-phone .field-item"));
        appendTexts(phones, content.select(".field-name-field-mobile-phone .field-item"));

        String street = firstText(content, ADDRESS_FIELD + " .street-block .thoroughfare");
        String block = firstText(content, ADDRESS_FIELD + " .street-block .premise");
        String postal = firstText(content, ADDRESS_FIELD + " .locality-block .postal-code");
        String locality = firstText(content, ADDRESS_FIELD + " .locality-block .locality");
        String address = street + " " + block + ", " + postal + " " + locality;

        List<String> companyData = new ArrayList<String>();
        companyData.add(name);
        companyData.add(site);
        companyData.add(email);
        companyData.add(phones.toString());
        companyData.add(categories.toString());
        companyData.add(logo);
        companyData.add(address);
        return companyData;
    }

    /** Returns the text of the first element matching {@code selector}, or "" if there is none. */
    private static String firstText(Element root, String selector) {
        Element match = root.select(selector).first();
        return match == null ? "" : match.text();
    }

    /** Returns {@code attribute} of the first element matching {@code selector}, or "" if there is none. */
    private static String firstAttr(Element root, String selector, String attribute) {
        Element match = root.select(selector).first();
        return match == null ? "" : match.attr(attribute);
    }

    /** Appends the text of each element followed by ", " to {@code out}. */
    private static void appendTexts(StringBuilder out, Elements items) {
        for (Element item : items) {
            out.append(item.text()).append(", ");
        }
    }

    /**
     * Sleeps for the given time.
     *
     * @return {@code false} if the thread was interrupted; the interrupt flag is restored
     */
    private static boolean pause(long millis) {
        try {
            Thread.sleep(millis);
            return true;
        } catch (InterruptedException e) {
            Thread.currentThread().interrupt();
            return false;
        }
    }
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment