Skip to content

Instantly share code, notes, and snippets.

@Jire
Created December 18, 2019 22:51
Show Gist options
  • Save Jire/de02b904437c3f2d370f7a60bfa1a8c1 to your computer and use it in GitHub Desktop.
Save Jire/de02b904437c3f2d370f7a60bfa1a8c1 to your computer and use it in GitHub Desktop.
import java.io.FileOutputStream
import java.nio.file.Paths
/**
 * One agent record, built from a tab-separated row of the input file.
 *
 * @property id1 first column of the row; mutable because `main` overwrites it with a
 *   marker string when the value also appears as another record's [id] or [id2].
 * @property id2 second column of the row.
 * @property id third column of the row (the caller strips everything from the first `#`);
 *   this is the key that scraped rows are matched against.
 * @property email fourth column of the row.
 * @property urls URLs collected for this agent; starts empty and is filled in by the caller.
 */
class Agent(
    var id1: String,
    val id2: String,
    val id: String,
    val email: String,
    val urls: MutableSet<String> = mutableSetOf()
)
/**
 * Merges the agent list in `original.txt` with the scraped rows in `scraped.txt`.
 *
 * Steps:
 * 1. Reads tab-separated agent rows from `original.txt`. Rows starting with `Caller` and
 *    empty lines are skipped. The third column is cut at the first `#`.
 * 2. For every agent whose first column equals the second or third column of any loaded
 *    agent (itself included), replaces the first column with `"REMOVED "` and counts it.
 * 3. Reads tab-separated `id<TAB>url` rows from `scraped.txt`. Rows starting with `number`
 *    and empty lines are skipped. Each URL is attached to the FIRST agent with that id;
 *    rows whose id matches no agent are ignored.
 * 4. Appends every agent whose id was matched in step 3 to `duplicates.txt` and every other
 *    agent to `parsed.txt`, then prints both counters.
 *
 * Both output files are opened in append mode, so re-running adds to existing content.
 *
 * @throws IllegalArgumentException if a non-empty data row has too few tab-separated fields.
 */
fun main() {
    val originalLines = Paths.get("original.txt").toFile().readLines()
    val scrapedLines = Paths.get("scraped.txt").toFile().readLines()

    // --- 1. Load agents -------------------------------------------------------------------
    val agents = mutableListOf<Agent>()
    for ((index, line) in originalLines.withIndex()) {
        // Skip the header row and empty lines (an empty line has no columns to read).
        if (line.isEmpty() || line.startsWith("Caller")) continue

        val fields = line.split('\t')
        // Fail with a pointer to the offending row instead of a bare IndexOutOfBoundsException.
        require(fields.size >= 4) {
            "original.txt line ${index + 1}: expected at least 4 tab-separated fields, found ${fields.size}"
        }
        agents += Agent(fields[0], fields[1], fields[2].substringBefore('#'), fields[3])
    }

    // --- 2. Flag agents whose first column is another record's id / id2 --------------------
    // Collect every id and id2 once so each membership test is a set lookup rather than a
    // scan of the whole list. id and id2 are immutable, so the set stays valid while id1
    // values are being overwritten below.
    val knownIds = mutableSetOf<String>()
    for (agent in agents) {
        knownIds += agent.id
        knownIds += agent.id2
    }

    var dedupedCount = 0
    for (agent in agents) {
        if (agent.id1 in knownIds) {
            agent.id1 = "REMOVED "
            dedupedCount++
        }
    }

    // --- 3. Attach scraped URLs ------------------------------------------------------------
    // Index the first agent for each id; getOrPut keeps the earliest entry, so URLs go to
    // the first agent in file order when several agents share an id.
    val firstAgentById = mutableMapOf<String, Agent>()
    for (agent in agents) firstAgentById.getOrPut(agent.id) { agent }

    val idsWithListings = mutableSetOf<String>()
    for ((index, line) in scrapedLines.withIndex()) {
        if (line.isEmpty() || line.startsWith("number")) continue

        val fields = line.split('\t')
        // Rows for unknown ids are ignored before the URL column is ever touched.
        val agent = firstAgentById[fields[0]] ?: continue
        require(fields.size >= 2) {
            "scraped.txt line ${index + 1}: expected at least 2 tab-separated fields, found ${fields.size}"
        }
        agent.urls.add(fields[1])
        // Every agent sharing this id is routed to duplicates.txt below.
        idsWithListings.add(fields[0])
    }

    // --- 4. Write results ------------------------------------------------------------------
    var withListingsCount = 0
    val listingsFile = Paths.get("duplicates.txt").toFile()
    val parsedFile = Paths.get("parsed.txt").toFile()

    // Nested `use` closes both files on every exit path, including when opening the second
    // file fails or when closing the first one throws.
    FileOutputStream(listingsFile, true).bufferedWriter().use { listingsOut ->
        FileOutputStream(parsedFile, true).bufferedWriter().use { parsedOut ->
            for (agent in agents) {
                val row = "${agent.id1}\t${agent.id2}\t${agent.id}\t${agent.email}\t"
                if (agent.id in idsWithListings) {
                    withListingsCount++
                    // Agent row, blank line, one URL per line, blank line.
                    listingsOut.write("$row\n\n${agent.urls.joinToString("\n")}\n\n")
                } else {
                    parsedOut.write("$row${agent.urls.joinToString(",")}\n")
                }
            }
        }
    }

    println("Number of de-duped: $dedupedCount")
    println("Number of agents with property listings found: $withListingsCount")
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment