2.4.4 解析器使用指南
本指南将详细介绍如何在 Bricks 框架中使用各种解析器,包括 `Response` 对象中的便捷方法和高级使用技巧。
Response 对象中的解析方法
`Response` 对象提供了多个便捷的解析方法,这些方法内部调用相应的解析器,使用起来更加简单。
XPath 解析
xpath(xpath, obj=None, **kwargs)
对响应内容进行 XPath 匹配。
参数说明:
| 参数名 | 参数类型 | 参数描述 | 默认值 |
|---|---|---|---|
| `xpath` | str | XPath 规则 | 必传 |
| `obj` | Optional | 要匹配的对象 | `None`(此时使用 `self.html`) |
| `**kwargs` | dict | 其他关键字参数 | - |
xpath_first(xpath, obj=None, default=None, **kwargs)
返回 XPath 匹配的第一个结果。
示例:
from bricks.lib.response import Response
html_content = """
<html>
<body>
<div class="container">
<h1>新闻标题</h1>
<p class="content">这是新闻内容第一段。</p>
<p class="content">这是新闻内容第二段。</p>
<span class="author">作者:张三</span>
<span class="date">2024-01-15</span>
</div>
</body>
</html>
"""
response = Response(content=html_content)
# 提取标题
title = response.xpath_first("//h1/text()")
print(title) # "新闻标题"
# 提取所有段落
paragraphs = response.xpath("//p[@class='content']/text()")
print(paragraphs) # ["这是新闻内容第一段。", "这是新闻内容第二段。"]
# 提取作者信息
author = response.xpath_first("//span[@class='author']/text()")
print(author) # "作者:张三"
# 提取不存在的元素,使用默认值
missing = response.xpath_first("//div[@class='missing']/text()", default="未找到")
print(missing) # "未找到"
JSON 解析
get(rule, obj=None, strict=True, **kwargs)
使用 JMESPath 语法解析 JSON 数据。
get_first(rule, default=None, obj=None, strict=True, **kwargs)
返回 JSON 匹配的第一个结果。
示例:
from bricks.lib.response import Response
import json
json_data = {
"status": "success",
"data": {
"users": [
{"id": 1, "name": "Alice", "email": "alice@example.com", "active": True},
{"id": 2, "name": "Bob", "email": "bob@example.com", "active": False},
{"id": 3, "name": "Charlie", "email": "charlie@example.com", "active": True}
],
"pagination": {
"page": 1,
"total": 3,
"per_page": 10
}
}
}
response = Response(content=json.dumps(json_data))
# 提取状态
status = response.get("status")
print(status) # "success"
# 提取所有活跃用户
active_users = response.get("data.users[?active]")
print(len(active_users)) # 2
# 提取用户名列表
names = response.get("data.users[*].name")
print(names) # ["Alice", "Bob", "Charlie"]
# 提取第一个用户的邮箱
first_email = response.get_first("data.users[0].email")
print(first_email) # "alice@example.com"
# 提取总记录数
total = response.get("data.pagination.total")
print(total) # 3
JSONPath 解析
jsonpath(jpath, obj=None, strict=True, **kwargs)
使用 JSONPath 语法解析 JSON 数据。
jsonpath_first(jpath, obj=None, default=None, strict=True, **kwargs)
返回 JSONPath 匹配的第一个结果。
示例:
from bricks.lib.response import Response
import json
data = {
"store": {
"book": [
{"category": "reference", "author": "Nigel Rees", "title": "Sayings of the Century", "price": 8.95},
{"category": "fiction", "author": "Evelyn Waugh", "title": "Sword of Honour", "price": 12.99},
{"category": "fiction", "author": "Herman Melville", "title": "Moby Dick", "price": 8.99}
],
"bicycle": {"color": "red", "price": 19.95}
}
}
response = Response(content=json.dumps(data))
# 提取所有书籍标题
titles = response.jsonpath("$.store.book[*].title")
print(titles) # ["Sayings of the Century", "Sword of Honour", "Moby Dick"]
# 提取价格小于10的书籍
cheap_books = response.jsonpath("$.store.book[?(@.price < 10)]")
print(len(cheap_books)) # 2
# 提取第一本书的作者
first_author = response.jsonpath_first("$.store.book[0].author")
print(first_author) # "Nigel Rees"
# 提取自行车颜色
bike_color = response.jsonpath_first("$.store.bicycle.color")
print(bike_color) # "red"
正则表达式解析
re(regex, obj=None, **kwargs)
使用正则表达式匹配文本。
re_first(regex, default=None, obj=None, **kwargs)
返回正则表达式匹配的第一个结果。
示例:
from bricks.lib.response import Response
text_content = """
欢迎访问我们的网站!
联系电话:400-123-4567 或 138-8888-9999
邮箱地址:contact@example.com
官方网站:https://www.example.com
备用网站:http://backup.example.org
"""
response = Response(content=text_content)
# 提取所有电话号码
phones = response.re(r"1[3-9]\d-\d{4}-\d{4}|400-\d{3}-\d{4}")
print(phones) # ["400-123-4567", "138-8888-9999"]
# 提取邮箱地址
emails = response.re(r"\w+@\w+\.\w+")
print(emails) # ["contact@example.com"]
# 提取所有URL
urls = response.re(r"https?://[^\s]+")
print(urls) # ["https://www.example.com", "http://backup.example.org"]
# 提取第一个手机号
mobile = response.re_first(r"1[3-9]\d-\d{4}-\d{4}", default="未找到手机号")
print(mobile) # "138-8888-9999"
高级解析技巧
使用 extract 方法进行批量解析
`Response.extract()` 方法支持使用规则字典进行批量数据提取:
from bricks.lib.response import Response
import json
# 复杂的商品数据
product_data = {
"product": {
"id": "P001",
"name": "智能手机",
"brand": "华为",
"price": {
"current": 2999,
"original": 3999,
"currency": "CNY"
},
"specifications": {
"screen": "6.1英寸",
"storage": "128GB",
"color": ["黑色", "白色", "蓝色"]
},
"reviews": [
{"user": "用户A", "rating": 5, "comment": "非常好用"},
{"user": "用户B", "rating": 4, "comment": "性价比不错"}
],
"in_stock": True
}
}
response = Response(content=json.dumps(product_data))
# 定义提取规则
rules = {
"id": "product.id",
"name": "product.name",
"brand": "product.brand",
"current_price": "product.price.current",
"discount_rate": "round((product.price.original - product.price.current) / product.price.original * 100, 1)",
"colors": "product.specifications.color",
"avg_rating": "avg(product.reviews[*].rating)",
"review_count": "length(product.reviews)",
"available": "product.in_stock"
}
# 批量提取
result = response.extract(engine="json", rules=rules)
print(result)
嵌套数据提取
处理复杂嵌套结构的数据:
from bricks.lib.response import Response
import json
# 嵌套的订单数据
order_data = {
"orders": [
{
"id": "ORD001",
"customer": {"name": "张三", "email": "zhangsan@example.com"},
"items": [
{"name": "商品A", "price": 100, "quantity": 2},
{"name": "商品B", "price": 50, "quantity": 1}
],
"status": "completed"
},
{
"id": "ORD002",
"customer": {"name": "李四", "email": "lisi@example.com"},
"items": [
{"name": "商品C", "price": 200, "quantity": 1}
],
"status": "pending"
}
]
}
response = Response(content=json.dumps(order_data))
# 提取每个订单的详细信息
rules = {
"orders[*]": {
"order_id": "id",
"customer_name": "customer.name",
"customer_email": "customer.email",
"total_amount": "sum(items[*].[price * quantity])",
"item_count": "length(items)",
"status": "status"
}
}
orders_info = response.extract(engine="json", rules=rules)
print(orders_info)
混合解析策略
在实际应用中,可能需要组合使用多种解析方法:
from bricks.lib.response import Response
# 包含HTML和JSON的混合内容
mixed_content = """
<html>
<head>
<title>商品详情页</title>
<script>
var productData = {
"id": "12345",
"name": "智能手表",
"price": 1299,
"specs": {"battery": "7天", "waterproof": "50米"}
};
</script>
</head>
<body>
<h1>智能手表</h1>
<div class="price">¥1299</div>
<div class="description">
<p>续航7天,防水50米</p>
</div>
</body>
</html>
"""
response = Response(content=mixed_content)
# 从HTML中提取基本信息
title = response.xpath_first("//title/text()")
price_text = response.xpath_first("//div[@class='price']/text()")
description = response.xpath_first("//div[@class='description']/p/text()")
print(f"页面标题: {title}")
print(f"价格显示: {price_text}")
print(f"描述: {description}")
# 从JavaScript中提取JSON数据
js_data = response.re_first(r'var productData = ({.*?});', default="{}")
if js_data != "{}":
# 进一步解析JSON
json_response = Response(content=js_data)
product_id = json_response.get("id")
battery_life = json_response.get("specs.battery")
print(f"产品ID: {product_id}")
print(f"电池续航: {battery_life}")
性能优化建议
- 缓存解析结果:`Response` 对象会自动缓存 `text`、`html`、`json` 等属性的计算结果
- 选择合适的解析器:根据数据格式选择最适合的解析器
- 避免重复解析:对于相同的数据,尽量复用解析结果
- 使用具体的选择器:编写精确的 XPath 或 JSONPath 表达式以提高性能
错误处理
from bricks.lib.response import Response
response = Response(content='{"invalid": json}')
# 安全的JSON解析
try:
data = response.json()
result = response.get("some.field")
except Exception as e:
print(f"解析错误: {e}")
result = None
# 使用默认值避免错误
safe_result = response.get_first("some.field", default="默认值")
特殊规则和内置变量
Bricks 框架提供了一些特殊的内置规则,可以在数据提取过程中使用:
内置变量
| 变量名 | 描述 | 返回值 |
|---|---|---|
| `@index` | 当前项的索引 | 数字索引 |
| `@ts` | 当前时间戳 | Unix 时间戳 |
| `@date` | 当前日期时间 | 格式化的日期字符串 |
| `@unpack` | 解包对象 | 展开字典或列表 |
使用示例
from bricks.lib.response import Response
import json
data = {
"products": [
{"name": "商品A", "price": 100},
{"name": "商品B", "price": 200},
{"name": "商品C", "price": 300}
]
}
response = Response(content=json.dumps(data))
# 使用内置变量
rules = {
"products[*]": {
"index": "@index", # 获取索引
"timestamp": "@ts", # 获取时间戳
"date": "@date", # 获取日期
"name": "name",
"price": "price"
}
}
result = response.extract(engine="json", rules=rules)
print(result)
@unpack 的使用
`@unpack` 可以将对象展开到当前层级:
from bricks.lib.response import Response
import json
data = {
"user": {
"profile": {"name": "张三", "age": 25},
"settings": {"theme": "dark", "language": "zh"}
}
}
response = Response(content=json.dumps(data))
rules = {
"user.profile": {"@unpack": "@"}, # 展开 profile 对象
"user.settings": {"@unpack": "@"} # 展开 settings 对象
}
result = response.extract(engine="json", rules=rules)
print(result) # [{"name": "张三", "age": 25, "theme": "dark", "language": "zh"}]
extract_all 方法
`extract_all` 方法用于批量处理多个规则,返回生成器:
from bricks.lib.response import Response
import json
data = {"items": [{"id": 1, "name": "A"}, {"id": 2, "name": "B"}]}
response = Response(content=json.dumps(data))
# 定义多个规则
rules_list = [
{"items[0]": {"first_id": "id", "first_name": "name"}},
{"items[1]": {"second_id": "id", "second_name": "name"}},
{"total": "length(items)"}
]
# 批量提取
for result in response.extract_all(engine="json", rules=rules_list):
print(result)
自定义解析引擎
除了内置解析器,还可以使用自定义解析引擎:
from bricks.lib.response import Response
def custom_parser(context):
"""
自定义解析器
:param context: 包含 response, request, rules 的字典
:return: 解析结果
"""
response = context["response"]
rules = context["rules"]
# 自定义解析逻辑
text = response.text
result = {}
for key, pattern in rules.items():
# 这里可以实现任何自定义的解析逻辑
if pattern == "length":
result[key] = len(text)
elif pattern == "upper":
result[key] = text.upper()
else:
result[key] = text
return [result]
# 使用自定义解析器
response = Response(content="Hello World")
rules = {"length": "length", "content": "upper"}
result = response.extract(engine=custom_parser, rules=rules)
print(result) # [{"length": 11, "content": "HELLO WORLD"}]
缓存机制
`Response` 对象具有智能缓存机制,可以提高性能:
from bricks.lib.response import Response
response = Response(content='{"data": "test"}')
# 第一次调用会解析并缓存
json_data1 = response.json() # 解析并缓存
# 后续调用直接返回缓存结果
json_data2 = response.json() # 直接从缓存返回
json_data3 = response.json() # 直接从缓存返回
# 修改 content 会清空缓存
response.content = '{"data": "new"}'
json_data4 = response.json() # 重新解析并缓存
错误处理最佳实践
from bricks.lib.response import Response
# 1. 检查响应状态
response = Response(content='{"data": "test"}', status_code=200)
if response.ok: # 检查状态码是否在 200-399 范围内
data = response.json()
else:
print(f"请求失败: {response.status_code}")
# 2. 安全的JSON解析
if response.is_json():
data = response.json()
else:
print("响应不是有效的JSON格式")
# 3. 使用默认值避免异常
title = response.xpath_first("//title/text()", default="无标题")
price = response.get_first("product.price", default=0)
# 4. 异常捕获
try:
result = response.extract(engine="json", rules={"data": "complex.path"})
except RuntimeError as e:
print(f"解析错误: {e}")
result = []
性能优化技巧
- 选择合适的解析器:
  - HTML/XML 数据使用 XPath
  - JSON 数据优先使用 JMESPath(`get`/`get_first`)
  - 简单文本匹配使用正则表达式
- 利用缓存:
  - `Response` 对象会自动缓存 `text`、`html`、`json` 属性
  - 避免重复调用相同的解析方法
- 精确的选择器:
  - 使用具体的 XPath 路径而不是通配符
  - 编写高效的 JMESPath 表达式
- 批量处理:
  - 使用 `extract` 方法的规则字典进行批量提取
  - 避免多次调用单个解析方法
通过合理使用这些解析方法和技巧,可以高效地从各种格式的数据中提取所需信息,构建强大的数据处理流程。