Last active
August 29, 2015 14:10
-
-
Save crazy4groovy/9cc65a64601ea50150ab to your computer and use it in GitHub Desktop.
Extracts XML data into CSV format. Can especially help interpret large BISAC XML files.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
/**
 * @name nodeDataExtractor.groovy
 * @author Steven Olsen <spam2steve@gmail.com>
 * @description Extracts XML data into CSV format. Can especially help interpret large BISAC XML files.
 * https://www.bisg.org/complete-bisac-subject-headings-2014-edition
 * @version 0.1
 */
///////////////////////////// | |
// ---- Configuration ---------------------------------------------------------
// args[0] : path of the XML file to read (falls back to C:\input.xml).
// args[1] : optional name of the repeating element that represents one record.
//           Supplying it also disables the BISAC-specific behaviour below
//           (human-name lookup, digit-only row filter, long-text truncation).
boolean customBaseNode = args.size() >= 2
File f = new File(args.size() >= 1 ? args[0] : /C:\input.xml/)
String BASE_NODE = customBaseNode ? args[1] : 'product'
boolean isBisacXML = !customBaseNode
boolean truncateText = !customBaseNode
String NODE_DELIMITER = '/'
/////////////////////////////

// ---- Parse -----------------------------------------------------------------
def slurper = new XmlSlurper()
// Allow a top-level DOCTYPE declaration to exist in the input.
// NOTE(review): enabling DOCTYPE processing is only appropriate for trusted
// input files; it re-opens the door to entity-expansion / XXE style payloads.
slurper.setFeature("http://apache.org/xml/features/disallow-doctype-decl", false)
def root = slurper.parse(f)

// One map per BASE_NODE element: node path -> list of text values found there.
List<Map> nodes = []
// Node path -> number of occurrences across every BASE_NODE element.
Map nodeKeysCtr = [:].withDefault{0}

// ---- Extract ---------------------------------------------------------------
// Start at BASE_NODE in the node tree and visit every descendant depth-first.
root[BASE_NODE].each { p ->
    Map product = [:].withDefault{[]}
    p.'**'.each { n ->
        String text = n.text()
        // Text containing markup is replaced with a placeholder to keep cells short.
        if (truncateText) text = text.contains("<") ? "...long text..." : text

        // Build the path relative to BASE_NODE by climbing the parent chain.
        String name = ""
        def cursor = n
        while (cursor.name() != BASE_NODE) {
            name = NODE_DELIMITER + cursor.name() + name
            cursor = cursor.parent()
        }

        // The BASE_NODE element itself yields an empty path and is skipped.
        if (name) {
            nodeKeysCtr[name]++
            product[name] << text
        }
    }
    nodes << product
}

// ---- Output ----------------------------------------------------------------
Map nodeLookup = getLookup(isBisacXML)

// Each column is a BASE_NODE. The total is the number of BASE_NODE elements
// actually collected, so it stays correct when a custom node name is given.
println "Node,Human Name,# times used (total ${BASE_NODE}'s:${nodes.size()}),${BASE_NODE}1,${BASE_NODE}2,Etc..."

// Each row is one node path, followed by its value(s) in every BASE_NODE.
nodeKeysCtr.keySet().toList().sort().each { prop ->
    if (isBisacXML && !(prop ==~ /.*\d.*/)) return // only print nodes with an int in them

    String bareProp = prop.split(NODE_DELIMITER)[-1]
    print "$prop,${nodeLookup[ bareProp ]},${nodeKeysCtr[prop]},"
    nodes.each { p ->
        // Commas and line breaks inside a value would break the CSV layout.
        print "${p[prop].toString().replaceAll('[,\n\r]',';')},"
    }
    // Terminate the row; without this every row ends up on one physical line.
    println()
}
//END// | |
///////////////////////////////////////////////// | |
///////////////////////////////////////////////// | |
//UTILS// | |
/**
 * Builds the table that maps short element tags (e.g. {@code b004}) to their
 * human-readable reference names (e.g. {@code ISBN}).
 *
 * Both returned maps are created with {@code withDefault{it}}, so looking up a
 * key that is not in the table yields the key itself.
 *
 * @param returnVals when falsy, an empty pass-through map is returned (every
 *                   lookup echoes its key); when truthy, the full tag table is
 *                   returned
 * @return a map from short tag to reference name, echoing unknown keys
 */
Map getLookup(returnVals) {
    if (!returnVals) return ([:].withDefault{it})
    return ([a001:'RecordReference',
        a002:'NotificationType',
        a194:'RecordSourceType',
        a195:'RecordSourceIdentifierType',
        a196:'RecordSourceIdentifier',
        a197:'RecordSourceName',
        a198:'DeletionCode',
        a199:'DeletionText',
        a245:'SubordinateEntries',
        b003:'PublicationDate',
        b004:'ISBN',
        b005:'EAN13',
        b006:'UPC',
        b007:'PublisherProductNo',
        b008:'ISMN',
        b009:'DOI',
        b010:'ReplacesISBN',
        b011:'ReplacesEAN13',
        b012:'ProductForm',
        b013:'BookFormDetail',
        b014:'ProductFormDescription',
        b015:'ItemQuantity',
        b016:'SeriesISSN',
        b017:'PublisherSeriesCode',
        b018:'TitleOfSeries',
        b019:'NumberWithinSeries',
        b020:'YearOfAnnual',
        b021:'ISBNOfSet',
        b022:'EAN13OfSet',
        b023:'TitleOfSet',
        b024:'SetPartNumber',
        b025:'SetPartTitle',
        b026:'ItemNumberWithinSet',
        b027:'TextCaseFlag',
        b028:'DistinctiveTitle',
        b029:'Subtitle',
        b030:'TitlePrefix',
        b031:'TitleWithoutPrefix',
        b032:'TranslationOfTitle',
        b033:'FormerTitle',
        b034:'SequenceNumber',
        b035:'ContributorRole',
        b036:'PersonName',
        b037:'PersonNameInverted',
        b038:'TitlesBeforeNames',
        b039:'NamesBeforeKey',
        b040:'KeyNames',
        b041:'NamesAfterKey',
        b042:'LettersAfterNames',
        b043:'TitlesAfterNames',
        b044:'BiographicalNote',
        b045:'ProfessionalPosition',
        b046:'Affiliation',
        b047:'CorporateName',
        b048:'ContributorDescription',
        b049:'ContributorStatement',
        b050:'ConferenceDescription',
        b051:'ConferenceRole',
        b052:'ConferenceName',
        b053:'ConferenceNumber',
        b054:'ConferenceDate',
        b055:'ConferencePlace',
        b056:'EditionTypeCode',
        b057:'EditionNumber',
        b058:'EditionStatement',
        b059:'LanguageOfText',
        b060:'OriginalLanguage',
        b061:'NumberOfPages',
        b062:'IllustrationsNote',
        b063:'MapScale',
        b064:'BASICMainSubject',
        b065:'BICMainSubject',
        b066:'BICVersion',
        b067:'SubjectSchemeIdentifier',
        b068:'SubjectSchemeVersion',
        b069:'SubjectCode',
        b070:'SubjectHeadingText',
        b071:'CorporateBodyAsSubject',
        b072:'PlaceAsSubject',
        b073:'AudienceCode',
        b074:'AudienceRangeQualifier',
        b075:'AudienceRangePrecision',
        b076:'AudienceRangeValue',
        b077:'ComplexitySchemeIdentifier',
        b078:'ComplexityCode',
        b079:'ImprintName',
        b081:'PublisherName',
        b083:'CountryOfPublication',
        b084:'CopublisherName',
        b085:'SponsorName',
        b086:'AnnouncementDate',
        b087:'CopyrightYear',
        b088:'YearFirstPublished',
        b089:'SalesRightsType',
        b090:'RightsCountry',
        b091:'RightsRegion',
        b125:'NumberOfIllustrations',
        b171:'SubjectSchemeName',
        b189:'USSchoolGrade',
        b190:'InterestAge',
        b191:'MainSubjectSchemeIdentifier',
        b200:'BASICVersion',
        b201:'WorkIDType',
        b202:'TitleType',
        b203:'TitleText',
        b204:'AudienceCodeType',
        b205:'AudienceCodeTypeName',
        b206:'AudienceCodeValue',
        b207:'AudienceDescription',
        b209:'CityOfPublication',
        b210:'NumberOfPieces',
        b211:'EpubType',
        b212:'EpubTypeVersion',
        b213:'EpubTypeDescription',
        b214:'EpubFormat',
        b215:'EpubFormatVersion',
        b216:'EpubFormatDescription',
        b217:'EditionVersionNumber',
        b218:'ExtentType',
        b219:'ExtentValue',
        b220:'ExtentUnit',
        b221:'ProductIDType',
        b225:'ProductPackaging',
        b233:'IDTypeName',
        b240:'OriginalPublisher',
        b241:'NameCodeType',
        b242:'NameCodeTypeName',
        b243:'NameCodeValue',
        b244:'IDValue',
        b246:'Barcode',
        b247:'PrefixToKey',
        b248:'SuffixToKey',
        b249:'UnnamedPersons',
        b250:'PersonNameType',
        b251:'CountryCode',
        b252:'LanguageCode',
        b253:'LanguageRole',
        b254:'PagesRoman',
        b255:'PagesArabic',
        b256:'IllustrationType',
        b257:'Number',
        b273:'SeriesIDType',
        b274:'ProductClassificationType',
        b275:'ProductClassificationCode',
        b276:'AbbreviatedLength',
        b277:'EpubTypeNote',
        b278:'EpubSource',
        b279:'EpubSourceVersion',
        b280:'EpubSourceDescription',
        b281:'SetItemTitle',
        b282:'SeriesPartName',
        b284:'LevelSequenceNumber',
        b285:'TextItemIDType',
        b286:'FirstPageNumber',
        b287:'LastPageNumber',
        b288:'ComponentTypeName',
        b289:'ComponentNumber',
        b290:'TextItemType',
        b291:'PublishingRole',
        b294:'WebsiteDescription',
        b295:'WebsiteLink',
        b305:'PersonDateRole',
        b306:'Date',
        b324:'StartDate',
        b325:'EndDate',
        b333:'ProductFormDetail',
        b334:'ProductFormFeatureType',
        b335:'ProductFormFeatureValue',
        b336:'ProductFormFeatureDescription',
        b337:'Percent',
        b340:'SequenceNumberWithinRole',
        b341:'ConferenceAcronym',
        b342:'ConferenceTheme',
        b352:'BibleContents',
        b353:'BibleVersion',
        b354:'BiblePurpose',
        b355:'BibleTextOrganization',
        b356:'BibleReferenceLocation',
        b357:'BibleTextFeature',
        b358:'ReligiousTextFeatureType',
        b359:'ReligiousTextFeatureCode',
        // Previously the truncated "Descript’n" (with a typographic apostrophe).
        b360:'ReligiousTextFeatureDescription',
        b361:'IllustrationTypeDescription',
        b362:'TradeAnnouncementDate',
        b367:'WebsiteRole',
        b368:'ThesisType',
        b369:'ThesisPresentedTo',
        b370:'ThesisYear',
        b374:'TextSourceCorporate',
        b376:'ReligiousTextID',
        b381:'SalesRestrictionType',
        b382:'SalesOutletName',
        b383:'SalesRestrictionDetail',
        b384:'TradeCategory',
        b385:'ProductContentType',
        b388:'RightsTerritory',
        b389:'StudyBibleType',
        b390:'PersonNameIDType',
        b391:'ConferenceSponsorIDType',
        b392:'CopyrightOwnerIDType',
        b393:'SalesOutletIDType',
        b394:'PublishingStatus',
        b395:'PublishingStatusNote',
        b398:'RegionCode',
        c093:'MeasureTypeCode',
        c094:'Measurement',
        c095:'MeasureUnitCode',
        c096:'Height',
        c097:'Width',
        c098:'Thickness',
        c099:'Weight',
        c258:'Dimensions',
        d100:'Annotation',
        d101:'MainDescription',
        d102:'TextTypeCode',
        d103:'TextFormat',
        d104:'Text',
        d105:'TextLinkType',
        d106:'TextLink',
        d107:'TextAuthor',
        d108:'TextSourceTitle',
        d109:'TextPublicationDate',
        e110:'ReviewQuote',
        f111:'CoverImageFormatCode',
        f112:'CoverImageLinkTypeCode',
        f113:'CoverImageLink',
        f114:'MediaFileTypeCode',
        f115:'MediaFileFormatCode',
        f116:'MediaFileLinkTypeCode',
        f117:'MediaFileLink',
        f118:'TextWithDownload',
        f119:'DownloadCaption',
        f120:'DownloadCredit',
        f121:'DownloadCopyrightNotice',
        f122:'DownloadTerms',
        f123:'ProductWebsiteLink',
        f170:'ProductWebsiteDescription',
        f259:'ImageResolution',
        f373:'MediaFileDate',
        g124:'PrizesDescription',
        g126:'PrizeName',
        g127:'PrizeYear',
        g128:'PrizeCountry',
        g129:'PrizeCode',
        g343:'PrizeJury',
        h130:'ReplacedByISBN',
        h131:'ReplacedByEAN13',
        h132:'AlternativeFormatISBN',
        h133:'AlternativeFormatEAN13',
        h134:'OutOfPrintDate',
        h163:'AlternativeProductISBN',
        h164:'AlternativeProductEAN13',
        h208:'RelationCode',
        j135:'SupplierEANLocationNumber',
        j136:'SupplierSAN',
        j137:'SupplierName',
        j138:'SupplyToCountry',
        j139:'SupplyToRegion',
        j140:'SupplyToCountryExcluded',
        j141:'AvailabilityCode',
        j142:'ExpectedShipDate',
        j143:'OnSaleDate',
        j144:'OrderTime',
        j145:'PackQuantity',
        j146:'AudienceRestrictionFlag',
        j147:'AudienceRestrictionNote',
        j148:'PriceTypeCode',
        j149:'ClassOfTrade',
        j150:'BICDiscountGroupCode',
        j151:'PriceAmount',
        j152:'CurrencyCode',
        j153:'TaxRateCode1',
        j154:'TaxRatePercent1',
        j155:'TaxableAmount1',
        j156:'TaxAmount1',
        j157:'TaxRateCode2',
        j158:'TaxRatePercent2',
        j159:'TaxableAmount2',
        j160:'TaxAmount2',
        j161:'PriceEffectiveFrom',
        j162:'PriceEffectiveUntil',
        j192:'UnpricedItemType',
        j239:'PricePer',
        j260:'DateFormat',
        j261:'PriceQualifier',
        j262:'PriceTypeDescription',
        j263:'MinimumOrderQuantity',
        j264:'BatchQuantity',
        j265:'FreeQuantity',
        j266:'PriceStatus',
        j267:'DiscountPercent',
        j268:'ReturnsCodeType',
        j269:'ReturnsCode',
        j270:'TelephoneNumber',
        j271:'FaxNumber',
        j272:'EmailAddress',
        j292:'SupplierRole',
        j296:'StockQuantityCodeTypeName',
        j297:'StockQuantityCode',
        j302:'ExpectedDate',
        j303:'Territory',
        j304:'CountryExcluded',
        j308:'TerritoryExcluded',
        j345:'SupplierIDType',
        j348:'IntermediaryAvailabilityCode',
        j349:'LocationName',
        j350:'OnHand',
        j351:'OnOrder',
        j363:'DiscountCodeType',
        j364:'DiscountCode',
        j365:'ReissueDate',
        j366:'ReissueDescription',
        j375:'CBO',
        j377:'LocationIDType',
        j378:'DiscountCodeTypeName',
        j387:'LastDateForReturns',
        j396:'ProductAvailability',
        j397:'SupplyToTerritory',
        j399:'SupplyRestrictionDetail',
        j400:'AgentIDType',
        j401:'AgentName',
        j402:'AgentRole',
        j403:'MarketCountry',
        j404:'MarketTerritory',
        j405:'MarketCountryExcluded',
        j406:'MarketRestrictionDetail',
        j407:'MarketPublishingStatus',
        j408:'MarketDateRole',
        k165:'PromotionCampaign',
        k166:'PromotionContact',
        k167:'InitialPrintRun',
        k168:'CopiesSold',
        k169:'BookClubAdoption',
        k309:'ReprintDetail',
        m172:'FromEANNumber',
        m173:'FromSAN',
        m174:'FromCompany',
        m175:'FromPerson',
        m176:'ToEANNumber',
        m177:'ToSAN',
        m178:'ToCompany',
        m179:'ToPerson',
        m180:'MessageNumber',
        m181:'MessageRepeat',
        m182:'SentDate',
        m183:'MessageNote',
        m184:'DefaultLanguageOfText',
        m185:'DefaultPriceTypeCode',
        m186:'DefaultCurrencyCode',
        m187:'DefaultLinearUnit',
        m188:'DefaultWeightUnit',
        m193:'DefaultClassOfTrade',
        m283:'FromEmail',
        m379:'SenderIDType',
        m380:'AddresseeIDType',
        n338:'NoSeries',
        n339:'NoContributor',
        n386:'NoEdition'
    ].withDefault{it})
}
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment