xpath的使用

xpath高级使用

XPath轴(XPath Axes)可定义某个相对于当前节点的节点集： 
1、child 选取当前节点的所有子元素 
2、parent 选取当前节点的父节点 
3、descendant 选取当前节点的所有后代元素（子、孙等） 
4、ancestor 选取当前节点的所有先辈（父、祖父等） 
5、descendant-or-self 选取当前节点的所有后代元素（子、孙等）以及当前节点本身 
6、ancestor-or-self 选取当前节点的所有先辈（父、祖父等）以及当前节点本身 
7、preceding-sibling 选取当前节点之前的所有同级节点 
8、following-sibling 选取当前节点之后的所有同级节点 
9、preceding 选取文档中当前节点的开始标签之前的所有节点 
10、following 选取文档中当前节点的结束标签之后的所有节点 
11、self 选取当前节点 
12、attribute 选取当前节点的所有属性 
13、namespace 选取当前节点的所有命名空间节点

// Select <span> nodes that have no class attribute
var result = node.SelectNodes(".//span[not(@class)]");
// Select <span> nodes that have neither a class attribute nor an id attribute
var result = node.SelectNodes(".//span[not(@class) and not(@id)]");
// Select <span> nodes whose class attribute does not contain the substring 'expire'
// (contains() is a substring test; a <span> without any class attribute also matches)
var result = node.SelectNodes(".//span[not(contains(@class,'expire'))]");
// Select <span> nodes whose class attribute contains the substring 'expire'
var result = node.SelectNodes(".//span[contains(@class,'expire')]");

xpath定位相邻下一个或n个元素方法

# Locate the first <p> sibling following the <p> whose id is "publisher",
# then take the second piece of its text when split on the full-width colon "：".
# NOTE(review): extract_first() yields None when nothing matches, so .split() would
# fail on such a page — confirm the target pages always contain these nodes.
publish_date = response.xpath("//p[@id='publisher']/following-sibling::p[1]/text()").extract_first().split("：")[1]
# Same as above, but for the second <p> sibling following the "publisher" <p>.
content_num = response.xpath("//p[@id='publisher']/following-sibling::p[2]/text()").extract_first().split("：")[1]

xpath获取不含某个子标签的div标签

<div>获取这个</div>
<div>不获取这个<p></p></div>
<!-- xpath获取不含有p标签 的div标签
"//div[not(p)]"
 -->

xpath关于tail

from lxml import etree
# html.unescape is the documented API; html.parser merely happens to import it.
from html import unescape

source = '''
<div>
  <p>内容111<em>em文本</em>内容222<span>span文本</span>内容333</p>
</div>
'''
# Goal: replace the text that sits directly inside <p> while leaving the text
# of its child tags (<em>, <span>) untouched.
# Collect every text node that is a direct child of <p>.
tree = etree.HTML(source)
text_list = tree.xpath("//p/text()")
print(text_list)
# Output:
# ['内容111', '内容222', '内容333']
# However, getparent() does not return <p> for every one of these strings:
for text in text_list:
    print(text, "->的父标签是：", text.getparent())
# Output:
# 内容111 ->的父标签是： <Element p at 0x22e49b4e080>
# 内容222 ->的父标签是： <Element em at 0x22e49b5a600>
# 内容333 ->的父标签是： <Element span at 0x22e49b5a540>
# Only the first string is p.text; the other two are stored as the .tail of
# the <em> and <span> children of <p>.
em_list = tree.xpath("//p/em")
print(em_list[0].tail)
span_list = tree.xpath("//p/span")
print(span_list[0].tail)
# Output:
# 内容222
# 内容333
# So: when a string is its parent's .text, overwrite parent.text; when it is a
# .tail (of <em>/<span>), overwrite that element's .tail instead.
replace_str = "你好老铁"  # loop-invariant, so defined once
for text in text_list:
    parent = text.getparent()
    if text.is_text:
        parent.text = replace_str
    elif text.is_tail:
        parent.tail = replace_str
# tostring() without an encoding emits ASCII with character references for the
# Chinese text; unescape() turns those references back into readable characters.
print(unescape(etree.tostring(tree).decode()))
# 结果，因为etree.HTML会自动补齐<html><body>标签，但是lxml.html.fromstring不会补齐
<html><body><div>
 <p>你好老铁<em>em文本</em>你好老铁<span>span文本</span>你好老铁</p>
</div>
</body></html>

xpath获取当前节点和之后的同级节点

<div>
  <p>
    <span>这个不要获取</span>
  </p>
  <p>
    <span>定位节点</span>
  </p>
  <p>
    <span>这个p要获取1</span>
  </p>
  <p>
    <span>这个p要获取2</span>
  </p>
</div>

p_list = tree.xpath("//div/p[child::span[text()='定位节点'] or preceding-sibling::p/child::span[text()='定位节点']]") # 获取的节点是最后的三个p标签

xpath把标签转化为字符串（类似scrapy的extract_first()）

# Parse the HTML text held in `content` (not defined in this snippet).
tree = etree.HTML(content)
# Find the <a> whose text contains "潜在供应商" and select its <blockquote> ancestors;
# [0] takes the first result and raises IndexError when the XPath matches nothing.
qianzai = tree.xpath('//a[contains(text(),"潜在供应商")]/ancestor::blockquote')[0]
# Serialize that element back to HTML: tostring() produces UTF-8 bytes here,
# which are decoded to str and stripped of surrounding whitespace.
qianzai_str = etree.tostring(qianzai, pretty_print=True, method='html', encoding="utf-8").decode("utf-8").strip()

xpath获取到的和源码不完全一样

from lxml import etree

# Deliberately malformed sample: only the first row has an opening <tr>; the
# other three rows carry a closing </tr> without a matching opening tag.
ss = """
<div class="zhengwen">
<table>
<tr><td>第一行</td></tr>
<td>第二行</td></tr>
<td>第三行</td></tr>
<td>第四行</td></tr>
</table>
</div>
"""

# Neither pattern below occurs in `ss`, so for this sample `text` equals `ss`.
text = ss.replace("</tr> </span></td>", "</tr> <tr>").replace("</tr></span></td>", "</tr><tr>")
tree = etree.HTML(text)
# Take the first <div class="zhengwen"> and serialize it back to a string
# (UTF-8 bytes decoded to str, surrounding whitespace stripped).
content_ele = tree.xpath("//div[@class='zhengwen']")[0]
content_str = etree.tostring(content_ele, encoding="utf-8").decode("utf-8").strip()
print(content_str)

# 输出结果
<div class="zhengwen">
<table>
<tr><td>第一行</td></tr>
<td>第二行</td>
<td>第三行</td>
<td>第四行</td>
</table>
</div>

# 可以看到输出的结果少了几个</tr>。这并不是xpath造成的，而是lxml的HTML解析器在解析阶段就把没有对应开始标签的</tr>丢弃了，所以序列化出来的结果和源码不完全一样，也和浏览器渲染出来的不一样。
# bs4也是这样

xpath深入

"""
1.把a标签的href替换为 http://www.baidu.com，没有href的增加href="http://www.baidu.com"
2.替换p标签内的文本，em标签不要改
"""

s = """
<p>Select和lxml的Element的区别</p>
<p>
    <a href="http://test.com">这是附件1</a>
    <a>附件222</a>
</p>
<p>要更改的<em>重要</em>文本</p>
"""

用lxml.etree

from lxml import etree
from html import unescape

tree = etree.HTML(s)
# Point every <a> at the new URL: set() overwrites an existing href and
# creates the attribute on an <a> that has none.
a_list = tree.xpath("//a")
for a in a_list:
    a.set("href", "http://www.baidu.com")

# Remove the first <p>. etree.HTML() wraps the fragment in <html><body>, so the
# <p> is not a direct child of `tree`; Element.remove() only accepts direct
# children (ValueError otherwise), hence the removal goes through the real parent.
p_list = tree.xpath("//p")
p_list[0].getparent().remove(p_list[0])

# Replace every text node that is a direct child of a remaining <p>: a string
# that is its parent's .text overwrites parent.text, a string that is the .tail
# of a child element (<a>, <em>) overwrites that element's .tail.
replace_str = "我是p标签的"  # loop-invariant, so defined once
text_list = tree.xpath("//p/text()")
for text in text_list:
    parent = text.getparent()
    if text.is_text:
        parent.text = replace_str
    elif text.is_tail:
        parent.tail = replace_str
# tostring() without an encoding emits ASCII with character references for the
# Chinese text; unescape() turns those references back into readable characters.
print(unescape(etree.tostring(tree).decode()))
# 结果
# <html><body>
# <p>我是p标签的<a href="http://www.baidu.com">这是附件1</a>我是p标签的<a href="http://www.baidu.com">附件222</a>我是p标签的</p>
# <p>我是p标签的<em>重要</em>我是p标签的</p>
# </body></html>

用lxml.html.fromstring

from lxml.html import fromstring
from lxml import etree
from html import unescape
tree = fromstring(s)
# Point every <a> at the new URL: set() overwrites an existing href and
# creates the attribute on an <a> that has none.
a_list = tree.xpath("//a")
for a in a_list:
    a.set("href", "http://www.baidu.com")

# Remove the first <p>. The recorded output below has a <div> root, so the <p>
# elements appear to be direct children of `tree` and tree.remove() accepts them.
p_list = tree.xpath("//p")
tree.remove(p_list[0])

# Replace every text node that is a direct child of a remaining <p>.
text_list = tree.xpath("//p/text()")
for text in text_list:
    replace_str = "我是p标签的"
    parent = text.getparent()
    # A string that is its parent's .text overwrites parent.text ...
    if text.is_text:
        parent.text = replace_str
    # ... while a string that is a child's .tail overwrites that child's .tail.
    elif text.is_tail:
        parent.tail = replace_str
# Decode the serialized bytes and convert character references back to text.
print(unescape(etree.tostring(tree).decode()))
# 结果
# <div>
# <p>我是p标签的<a href="http://www.baidu.com">这是附件1</a>我是p标签的<a href="http://www.baidu.com">附件222</a>我是p标签的</p>
# <p>我是p标签的<em>重要</em>我是p标签的</p>
# </div>

用 scrapy.Selector，这个没找到直接替换文本的办法（不过 Selector.root 就是底层的 lxml 元素，理论上可以对它使用上面的 text/tail 方法来替换）

from scrapy import Selector
selector = Selector(text=s)
# Set href on every <a> through the attribute mapping of the selector's
# underlying element (`root.attrib`); assigning a key adds or overwrites it.
a_list = selector.xpath("//a")
for a in a_list:
    a_attributes = a.root.attrib
    a_attributes["href"] = "http://www.baidu.com"
# Remove the first <p> from the document.
# NOTE(review): newer parsel releases deprecate Selector.remove() in favour of
# drop() — confirm against the installed version.
p_list = selector.xpath("//p")
p_list[0].remove()
# Print the first string returned by getall() (the serialized document shown below).
print(selector.getall()[0])
# 结果
# <html><body><p>
#     <a href="http://www.baidu.com">这是附件1</a>
#     <a href="http://www.baidu.com">附件222</a>
# </p>
# <p>要更改的<em>重要</em>文本</p></body></html>