Skip to content

Instantly share code, notes, and snippets.

@zeratax
Last active May 4, 2021 21:25
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save zeratax/2fbba7353a1adfd5d5fdee1606bee666 to your computer and use it in GitHub Desktop.
Save zeratax/2fbba7353a1adfd5d5fdee1606bee666 to your computer and use it in GitHub Desktop.
Download Japanese names from Wikipedia
require "json"
require "uri"
require "http/client"
require "http/params"
# Parsed URL of the English Wikipedia `api.php` endpoint that all queries are sent to.
API_ENDPOINT = URI.parse("https://en.wikipedia.org/w/api.php")
# Helpers for downloading the member titles of a Wikipedia category through
# the MediaWiki `action=query&list=categorymembers` API and storing them in a
# JSON file of the form `{"names": [...]}`.
module Wiki
  # One page of a `list=categorymembers` query reply.
  class Response
    include JSON::Serializable

    # Continuation token read from `continue.cmcontinue`; stays `nil` when the
    # reply carries no `continue` object.
    @[JSON::Field(key: "continue", root: "cmcontinue", emit_null: true)]
    getter continue : String | Nil = nil

    # Entries read from `query.categorymembers`.
    @[JSON::Field(key: "query", root: "categorymembers")]
    getter members : Array(Member)
  end

  # A single category member; only its `title` is deserialized.
  class Member
    include JSON::Serializable

    @[JSON::Field(key: "title")]
    getter title : String
  end

  # Writes the entries of *titles* that are not yet in *seen* to *io* as
  # comma-separated JSON string literals and appends them to *seen*.
  #
  # When *leading_separator* is true and there is something to write, `",\n"`
  # is emitted first so the fragment can follow an earlier one. Nothing at all
  # is written for a page without new titles, so no dangling comma can occur.
  #
  # Returns the number of titles written.
  def self.write_new_titles(
    io : IO,
    seen : Array(String),
    titles : Array(String),
    leading_separator : Bool
  ) : Int32
    fresh = (titles - seen).uniq
    return 0 if fresh.empty?

    io << ",\n" if leading_separator
    # `to_json` (rather than `Array#to_s`) guarantees valid JSON escaping.
    io << fresh.map(&.to_json).join(", ")
    seen.concat(fresh)
    fresh.size
  end

  # Downloads every member of *category* and writes the collected values to
  # *output_path* as `{"names": [ ... ]}`.
  #
  # Pages are requested 200 entries at a time and followed via the
  # `cmcontinue` token until the API stops returning one, pausing one second
  # between requests. Each title is written once, in first-seen order.
  #
  # *property* is sent as `cmprop` and *format* as `format`; the reply is
  # always parsed as JSON, so *format* should stay `"json"`.
  #
  # Raises if a request does not answer with HTTP 200, or if the body cannot
  # be deserialized into a `Response`.
  #
  # NOTE: the misspelled method name is kept for backward compatibility.
  def self.get_wiki_catergory(
    category : String,
    output_path : Path,
    property = "title",
    format = "json"
  )
    cmcontinue = ""
    # Work on a copy so the shared API_ENDPOINT constant is never mutated.
    url = API_ENDPOINT.dup
    members = [] of String
    wrote_titles = false

    puts "starting download of #{category}"
    # Keep one handle open for the whole download instead of reopening the
    # file for every fragment.
    File.open(output_path, "w") do |file|
      file << %Q({"names": [\n)
      loop do
        query = {
          "action"  => "query",
          "list"    => "categorymembers",
          "cmprop"  => property,
          "format"  => format,
          "cmlimit" => "200",
          "cmtitle" => category,
        }
        # Only send a continuation token once the API has handed one out.
        query["cmcontinue"] = cmcontinue unless cmcontinue.empty?
        url.query = HTTP::Params.encode(query)

        response = HTTP::Client.get url
        unless response.status_code == 200
          # Fail with the real cause instead of a confusing JSON error below.
          raise "query #{url} responded with #{response.status_code}"
        end

        page = Response.from_json response.body
        written = write_new_titles(file, members, page.members.map(&.title), wrote_titles)
        wrote_titles = true if written > 0

        cmcontinue = page.continue || ""
        puts "continuing with #{cmcontinue}"
        break if cmcontinue.empty?
        sleep 1.second
      end
      file << "\n]}"
    end
    puts "written to #{output_path}"
  end
end
# Script entry point: fetch the Japanese masculine given names category and
# store the result in ./japanesenames.json.
category = "Category:Japanese_masculine_given_names"
destination = Path.posix("./japanesenames.json")
Wiki.get_wiki_catergory(category, destination)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment