Last active
October 8, 2023 13:04
-
-
Save iamandrewluca/ae1cc8dcca2259b0c18d731b10d3c958 to your computer and use it in GitHub Desktop.
Old parser for https://www.yellowpages.md
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import org.apache.commons.csv.CSVFormat; | |
import org.apache.commons.csv.CSVPrinter; | |
import org.apache.commons.csv.CSVRecord; | |
import org.jsoup.Jsoup; | |
import org.jsoup.nodes.Document; | |
import org.jsoup.nodes.Element; | |
import org.jsoup.select.Elements; | |
import java.io.*; | |
import java.net.*; | |
import java.util.ArrayList; | |
import java.util.List; | |
public class Main { | |
static String ypMain = "http://yp.md"; | |
static String ypLink = ypMain+ "/rom/search/companies?page="; | |
private static final String NEW_LINE_SEPARATOR = "\n"; | |
private static final Object [] FILE_HEADER = {"name","site","email","phones","categories", "logo", "address"}; | |
public static void main(String[] args) { | |
if (args.length != 1) { | |
System.out.println("need filename parameter"); | |
return; | |
} | |
FileWriter writer = null; | |
CSVPrinter csvPrinter = null; | |
CSVFormat csvFormat = CSVFormat.DEFAULT.withRecordSeparator(NEW_LINE_SEPARATOR); | |
try { | |
writer = new FileWriter(args[0]); | |
csvPrinter = new CSVPrinter(writer, csvFormat); | |
csvPrinter.printRecord(FILE_HEADER); | |
boolean existsCompanies = true; | |
int i = 0; | |
do { | |
String currentPage = ypLink + i; | |
try { | |
Document doc = null; | |
try { | |
doc = Jsoup.connect(currentPage).get(); | |
System.out.println("Page " + i + " loaded"); | |
} catch (Exception e) { | |
System.out.println("Page " + i + " could not load. Retrying"); | |
continue; | |
} | |
Elements companies = doc.select(".pane-content .view-content .views-row"); | |
if (companies.size() != 0) { | |
for (Element element : companies) { | |
String name = ""; | |
String address = ""; | |
String phones = ""; | |
String site = ""; | |
String email = ""; | |
String categories = ""; | |
String logo = ""; | |
String companyLink = element.select(".content .right .node-title a").attr("href"); | |
Document companyDoc = null; | |
try { | |
companyDoc = Jsoup.connect(ypMain + companyLink).get(); | |
} catch (Exception e) { | |
System.out.println("Company " + companyLink + " could not load first attemp"); | |
try { | |
companyDoc = Jsoup.connect(ypMain + companyLink).get(); | |
} catch (Exception ex) { | |
System.out.println("Company " + companyLink + " could not load second attemp"); | |
continue; | |
} | |
} | |
Element companyContent = companyDoc.select(".company-content .col-company").get(1); | |
try { | |
logo = companyDoc.select(".company-content .col-company-logo img").get(0).attr("src"); | |
} catch (Exception e) { | |
// e.printStackTrace(); | |
} | |
try { | |
name = companyContent.select("h2").get(0).text(); | |
} catch (Exception e) { | |
// e.printStackTrace(); | |
} | |
Elements categoriesTags = companyContent.select(".field-row-tags ul li a"); | |
for (Element category : categoriesTags) { | |
categories += category.text() + ", "; | |
} | |
try { | |
site = companyContent.select(".field-name-field-company-site a").get(0).attr("href"); | |
} catch (Exception e) { | |
// e.printStackTrace(); | |
} | |
try { | |
email = companyContent.select(".field-name-field-email a").get(0).attr("href"); | |
email = email.replace("mailto:", ""); | |
} catch (Exception e) { | |
// e.printStackTrace(); | |
} | |
Elements mobilePhones = companyContent.select(".field-name-field-landline-phone .field-item"); | |
for (Element phone : mobilePhones) { | |
phones += phone.text() + ", "; | |
} | |
Elements landlinePhones = companyContent.select(".field-name-field-mobile-phone .field-item"); | |
for (Element phone : landlinePhones) { | |
phones += phone.text() + ", "; | |
} | |
String street = ""; | |
try { | |
street = companyContent.select(".field-name-field-address .street-block .thoroughfare").get(0).text(); | |
} catch (Exception e) { | |
// e.printStackTrace(); | |
} | |
String block = ""; | |
try { | |
block = companyContent.select(".field-name-field-address .street-block .premise").get(0).text(); | |
} catch (Exception e) { | |
// e.printStackTrace(); | |
} | |
String postal = ""; | |
try { | |
postal = companyContent.select(".field-name-field-address .locality-block .postal-code").get(0).text(); | |
} catch (Exception e) { | |
// e.printStackTrace(); | |
} | |
String locality = ""; | |
try { | |
locality = companyContent.select(".field-name-field-address .locality-block .locality").get(0).text(); | |
} catch (Exception e) { | |
// e.printStackTrace(); | |
} | |
address = street + " " + block + ", " + postal + " " + locality; | |
System.out.println(i + ": " + name); | |
// System.out.println(address); | |
// System.out.println(phones); | |
// System.out.println(site); | |
// System.out.println(email); | |
// System.out.println(categories); | |
// System.out.println(logo); | |
List<String> companyData = new ArrayList<String>(); | |
companyData.add(name); | |
companyData.add(site); | |
companyData.add(email); | |
companyData.add(phones); | |
companyData.add(categories); | |
companyData.add(logo); | |
companyData.add(address); | |
csvPrinter.printRecord(companyData); | |
try { | |
Thread.sleep(500); | |
} catch (InterruptedException e) { | |
e.printStackTrace(); | |
} | |
} | |
} else { | |
existsCompanies = false; | |
} | |
} catch (IOException e) { | |
e.printStackTrace(); | |
existsCompanies = false; | |
} | |
i++; | |
} while (existsCompanies); | |
} catch (IOException e) { | |
e.printStackTrace(); | |
} finally { | |
try { | |
writer.flush(); | |
writer.close(); | |
csvPrinter.close(); | |
} catch (IOException e) { | |
e.printStackTrace(); | |
} | |
} | |
} | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment