Melvin Wevers melvinwevers

## trove_newspapers.py
import urllib
import requests
from lxml import etree
import xml.etree.ElementTree as ET

# ID lists, one per period (pre-1840, 1840s, 1850s) going by the variable names.
# Pick whichever list you need; the complete listing lives at
# https://gist.github.com/mhbeals/1ad7cd04ca0f8fd74e12f6151664873e
# NOTE(review): presumably Trove newspaper title IDs -- confirm against the gist.
list_pre_1840 = [
    3, 1046, 1047, 4, 5, 50, 273, 22, 23, 76,
    1230, 19, 24, 1282, 1235, 693, 95, 272, 1236, 695,
    696, 1237, 37, 525, 944, 6, 1233, 1239, 869, 694,
    945, 1240, 1242, 66, 170, 1013, 1238, 40, 1142, 1232,
    1241, 1329, 935, 171, 96, 1137, 936, 1243, 41, 20,
    48, 1231, 984,
]

list_1840 = [
    1330, 1030, 1331, 986, 181, 21, 1037, 339, 292, 1014,
    1015, 867, 1336, 1027, 1012, 1022, 1036, 74, 1026, 1025,
    1039, 35, 110, 8, 1033, 1028, 1035, 190, 821, 1040,
    1034, 78, 1020, 1018, 1023, 1038, 1031, 1021, 690, 178,
    1234, 1011, 172, 58, 1029, 1016, 285, 160, 1024, 1004,
    1017, 1138, 937, 1019, 1032, 941, 18, 14, 94, 863,
    864, 987, 284, 26, 54, 364, 13, 1100, 938, 939,
    55,
]

list_1850 = [
    1244, 314, 464, 67, 1139, 277, 56, 1041, 283, 174,
    805, 107, 1339, 582, 31, 809, 326, 1245, 948, 213,
    180, 874, 346, 189, 484, 262, 161, 669, 985, 706,
    193, 685, 162, 496, 959, 32, 1324, 33, 163, 1246,
    257, 1247, 7, 287, 1053, 382, 380, 164, 558, 1248,
    104, 1054, 65, 381, 353,
]
list_1860 = [365,415,927,478,10,6

## download-aww-text.md

      
        
          
            
              
              1 file
            
          
          
            
              
              0 forks
            
          
          
            
              
              0 comments
            
          
          
            
              
              1 star
            
          
        
        
          
              
          
          
            
                wragge
                / download-aww-text.md
            
            
              Last active
              April 9, 2018 12:14
            
          
        
      
        
  
      
    Getting the text content of articles from the Australian Women's Weekly

The TroveHarvester makes it easy to download articles in bulk from Trove's digitised newspapers. Using the --text option, you can also save the full text of every article.
However, this doesn't work for the Australian Women's Weekly, as the full text is not available through the Trove API. Fortunately, the article text can be downloaded from the web interface.
The one-line script below uses wget, so make sure you have it installed before you go any further. (You can install it with Homebrew if you're using a Mac.)
Instructions
	import urllib
	import requests
	from lxml import etree
	import xml.etree.ElementTree as ET

	# ID lists, one per period (pre-1840, 1840s, 1850s) going by the variable names.
	# Use one of these lists, or visit
	# https://gist.github.com/mhbeals/1ad7cd04ca0f8fd74e12f6151664873e for the full listing.
	# NOTE(review): presumably Trove newspaper title IDs -- confirm against the gist.
	list_pre_1840 = [3,1046,1047,4,5,50,273,22,23,76,1230,19,24,1282,1235,693,95,272,1236,695,696,1237,37,525,944,6,1233,1239,869,694,945,1240,1242,66,170,1013,1238,40,1142,1232,1241,1329,935,171,96,1137,936,1243,41,20,48,1231,984]
	list_1840 = [1330,1030,1331,986,181,21,1037,339,292,1014,1015,867,1336,1027,1012,1022,1036,74,1026,1025,1039,35,110,8,1033,1028,1035,190,821,1040,1034,78,1020,1018,1023,1038,1031,1021,690,178,1234,1011,172,58,1029,1016,285,160,1024,1004,1017,1138,937,1019,1032,941,18,14,94,863,864,987,284,26,54,364,13,1100,938,939,55]
	list_1850 = [1244,314,464,67,1139,277,56,1041,283,174,805,107,1339,582,31,809,326,1245,948,213,180,874,346,189,484,262,161,669,985,706,193,685,162,496,959,32,1324,33,163,1246,257,1247,7,287,1053,382,380,164,558,1248,104,1054,65,381,353]
	list_1860 = [365,415,927,478,10,6