在进行网络数据抓取时,使用requests可以直接处理json格式的返回结果,但对于xml格式的返回结果,requests并没有提供直接的支持。python内建了xml解析器,下面的示例说明两种情况(解析本地xml文件、解析网络请求返回的xml)下对xml的解析。
from xml.etree import ElementTree
# Example 1: parse an XML file on disk and print the u'关联关系' attribute
# of every <ht> element found under <layer>/<hts>.
xml_file = r'D:\BJMapSearch.xml'
try:
    tree = ElementTree.parse(xml_file)
    root = tree.getroot()  # the document's root element
except (IOError, ElementTree.ParseError):
    # IOError covers an unreadable/missing file, ParseError malformed XML.
    print('解析xml文件出错')
    # `return` is only legal inside a function; at script level stop the
    # program with the same failure status instead.
    raise SystemExit(-1)

# Query the nodes. A single path expression returns an empty list when
# <layer> or <hts> is absent, whereas chaining find() calls would raise
# AttributeError on None. Note it matches <ht> under every layer/hts pair.
data_node = root.findall("layer/hts/ht")
for node in data_node:
    # Element.get returns None when the attribute is missing, so one lookup
    # replaces the separate membership test.
    relation = node.get(u'关联关系')
    if relation is not None:
        print(relation)
# xml结构如下
<Response>
<count>10</count>
<total>4706</total>
<actualtotal>4706</actualtotal>
<layer id="L10319" type="点">...</layer>
</Response>
# layer结点的结构如下
<hts Sum="10">
<ht></ht>
<ht></ht>
...
</hts>
import requests
from xml.etree import ElementTree
# Example 2: fetch XML over HTTP and print the u'关联关系' attribute of every
# <ht> element found under <layer>/<hts>.
# NOTE: despite its name, xml_file holds the requests Response object.
xml_file = requests.get('http://www.beijingmap.gov.cn/bjgtj/BJMapSearch?p=0%2C10&s=%2A&l=L10319&t=xml')
# Fail on an HTTP error status instead of trying to parse an error page.
xml_file.raise_for_status()
# fromstring returns the root element directly. Pass the raw bytes
# (.content) rather than the decoded .text so that the XML parser applies
# the encoding declared by the document itself, instead of depending on the
# charset requests guessed from the HTTP headers.
root = ElementTree.fromstring(xml_file.content)
# A single path expression returns an empty list when <layer> or <hts> is
# absent, whereas chaining find() calls would raise AttributeError on None.
data_node = root.findall("layer/hts/ht")
for node in data_node:
    # Element.get returns None when the attribute is missing.
    relation = node.get(u'关联关系')
    if relation is not None:
        print(relation)
如果网络请求返回的结果比较大,不适合一次性读入内存,需要改用流式(增量)解析的方式:
# Example 3: parse a large XML response incrementally while it downloads.
# NOTE(review): `url` is not defined in this snippet; set it to the address
# of the XML document before running.
response = requests.get(url, stream=True)
# If the server sent a gzip- or deflate-compressed response, decompress
# as we read the raw stream:
response.raw.decode_content = True
events = ElementTree.iterparse(response.raw)
# iterparse yields (event, elem) pairs in that order; with the default
# arguments only "end" events are reported, i.e. each element is complete
# when it is delivered.
for event, elem in events:
    # do something with `elem`
    # Then drop the element's children, text and attributes so the
    # partially built tree does not grow to the size of the whole document.
    # Remove this call if later processing needs already-seen descendants.
    elem.clear()