A Complete Guide to the Python Requests Library
Overview
Requests is the most popular HTTP library for Python, created by Kenneth Reitz. It provides a clean, human-friendly API that makes sending HTTP requests remarkably simple. Compared with Python's built-in urllib module, Requests is far more intuitive and easier to use.
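To make that comparison concrete, here is a minimal sketch of the same parameterized GET written first with the standard-library urllib and then with Requests (httpbin.org is used purely as a demonstration endpoint):

# With urllib, building the query string and decoding the body are manual steps
from urllib.parse import urlencode
from urllib.request import urlopen

url = 'https://httpbin.org/get?' + urlencode({'key': 'value'})
with urlopen(url) as resp:
    body = resp.read().decode('utf-8')

# With Requests, the same request is a single call
import requests
body = requests.get('https://httpbin.org/get', params={'key': 'value'}).text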
Installation
# Install with pip
pip install requests
# Install with conda
conda install requests
# Add to a project with Poetry
poetry add requests
# Install with uv
uv add requests
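To confirm the installation succeeded, you can print the installed version:

python -c "import requests; print(requests.__version__)"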
Basic Usage
1. Importing the Library
import requests
2. Sending GET Requests
# A basic GET request
response = requests.get('https://api.github.com')
# A GET request to a specific endpoint
response = requests.get('https://api.github.com/users/python')
# With query parameters
params = {'key1': 'value1', 'key2': 'value2'}
response = requests.get('https://httpbin.org/get', params=params)
# Check the response status
if response.status_code == 200:
    print("Request succeeded")
    print(response.text)  # response body as text
else:
    print(f"Request failed with status code: {response.status_code}")
3. Sending POST Requests
# Send JSON data
data = {'name': 'John', 'age': 30}
response = requests.post('https://httpbin.org/post', json=data)
# Send form data
form_data = {'username': 'user', 'password': 'pass'}
response = requests.post('https://httpbin.org/post', data=form_data)
# Send a raw request body
raw_data = 'raw string data'
response = requests.post('https://httpbin.org/post', data=raw_data)
4. Other HTTP Methods
# PUT request
response = requests.put('https://httpbin.org/put', json={'key': 'value'})
# DELETE request
response = requests.delete('https://httpbin.org/delete')
# HEAD request
response = requests.head('https://httpbin.org/get')
# OPTIONS request
response = requests.options('https://httpbin.org/get')
# PATCH request
response = requests.patch('https://httpbin.org/patch', json={'key': 'value'})
Response Handling
1. Response Content
response = requests.get('https://api.github.com')
# Text content
text = response.text
print(text)
# JSON content (parsed automatically; raises an exception if the body is not valid JSON)
json_data = response.json()
print(json_data)
# Binary content
binary_content = response.content
print(binary_content)
# Raw response object (to read from it, pass stream=True on the request)
raw_response = response.raw
print(raw_response)
2. Response Status Codes
response = requests.get('https://httpbin.org/status/200')
# The status code
status_code = response.status_code
print(status_code)
# Raise an exception for error responses
response.raise_for_status()  # raises HTTPError for 4xx/5xx status codes
# Common status checks
if response.ok:  # True for any status code below 400
    print("Request succeeded")
elif response.status_code == 404:
    print("Resource not found")
elif response.status_code == 500:
    print("Server error")
3. Response Headers
response = requests.get('https://httpbin.org/get')
# All response headers
headers = response.headers
print(headers)
# A specific header
content_type = response.headers['Content-Type']
print(content_type)
# Header lookups are case-insensitive
content_type = response.headers.get('content-type')
print(content_type)
4. Response Encoding
response = requests.get('https://httpbin.org/encoding/utf8')
# Inspect the detected encoding
encoding = response.encoding
print(encoding)
# Set the encoding explicitly
response.encoding = 'utf-8'
# Auto-detect the encoding from the body (recommended when the declared encoding is unreliable)
response.encoding = response.apparent_encoding
Request Configuration
1. Request Headers
# Set custom request headers
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36',
    'Accept': 'application/json',
    'Authorization': 'Bearer your_token_here'
}
response = requests.get('https://httpbin.org/headers', headers=headers)
2. Timeouts
# Set a timeout in seconds (this endpoint delays 5 s, so a 3 s timeout raises requests.exceptions.Timeout)
response = requests.get('https://httpbin.org/delay/5', timeout=3)
# Set the connect and read timeouts separately
response = requests.get('https://httpbin.org/delay/5', timeout=(3.05, 10))
# No timeout at all (not recommended)
# response = requests.get('https://httpbin.org/delay/5', timeout=None)
3. Passing Parameters
# URL query parameters
params = {
    'key1': 'value1',
    'key2': 'value2',
    'list': ['item1', 'item2']  # encoded as list=item1&list=item2
}
response = requests.get('https://httpbin.org/get', params=params)
# Inspect the final request URL
print(response.url)
4. Proxies
# HTTP/HTTPS proxies
proxies = {
    'http': 'http://proxy.example.com:8080',
    'https': 'https://proxy.example.com:8080'
}
# Proxies with authentication
proxies = {
    'http': 'http://user:password@proxy.example.com:8080',
    'https': 'https://user:password@proxy.example.com:8080'
}
response = requests.get('https://httpbin.org/ip', proxies=proxies)
Session Management
1. The Session Object
# Create a session
session = requests.Session()
# Session-level configuration applied to every request
session.headers.update({'User-Agent': 'MyApp/1.0'})
session.auth = ('username', 'password')
# Send requests through the session
response1 = session.get('https://httpbin.org/cookies/set/session_id/123')
response2 = session.get('https://httpbin.org/cookies')
# The session persists cookies between requests automatically
print(response2.json())
# Close the session when you are done
session.close()
2. Cookie Handling
# Set cookies manually
cookies = {'session_id': '123456'}
response = requests.get('https://httpbin.org/cookies', cookies=cookies)
# Read cookies from a response
response = requests.get('https://httpbin.org/cookies/set/test/value')
cookies = response.cookies
print(cookies.get_dict())
# Use a RequestsCookieJar for domain-level control
jar = requests.cookies.RequestsCookieJar()
jar.set('cookie1', 'value1', domain='httpbin.org')
jar.set('cookie2', 'value2', domain='httpbin.org')
response = requests.get('https://httpbin.org/cookies', cookies=jar)
Authentication
1. Basic Authentication
from requests.auth import HTTPBasicAuth
# Basic authentication
response = requests.get('https://httpbin.org/basic-auth/user/pass',
                        auth=HTTPBasicAuth('user', 'pass'))
# Shorthand: a plain tuple works too
response = requests.get('https://httpbin.org/basic-auth/user/pass',
                        auth=('user', 'pass'))
2. Digest Authentication
from requests.auth import HTTPDigestAuth
response = requests.get('https://httpbin.org/digest-auth/auth/user/pass',
                        auth=HTTPDigestAuth('user', 'pass'))
3. OAuth Authentication
# OAuth 1.0 (requires the third-party requests-oauthlib package: pip install requests-oauthlib)
from requests_oauthlib import OAuth1
auth = OAuth1('client_key', 'client_secret',
              'resource_owner_key', 'resource_owner_secret')
response = requests.get('https://api.twitter.com/1.1/account/verify_credentials.json', auth=auth)
Error Handling
1. Exception Types
import requests
from requests.exceptions import RequestException, Timeout, ConnectionError, HTTPError

try:
    response = requests.get('https://httpbin.org/status/404', timeout=5)
    response.raise_for_status()  # raises HTTPError for 4xx/5xx responses
except Timeout:
    print("Request timed out")
except ConnectionError:
    print("Connection error")
except HTTPError as e:
    print(f"HTTP error: {e}")
except RequestException as e:
    print(f"Request exception: {e}")
except Exception as e:
    print(f"Unexpected error: {e}")
2. Retry Mechanism
from requests.adapters import HTTPAdapter
from urllib3.util.retry import Retry

# Define the retry strategy
retry_strategy = Retry(
    total=3,  # total number of retry attempts
    backoff_factor=1,  # back-off factor; delays between attempts grow exponentially
    status_forcelist=[429, 500, 502, 503, 504],  # status codes that trigger a retry
)
# Attach the strategy to an adapter
adapter = HTTPAdapter(max_retries=retry_strategy)
# Mount the adapter on a session
session = requests.Session()
session.mount("http://", adapter)
session.mount("https://", adapter)
# Requests sent through this session are now retried automatically
response = session.get('https://httpbin.org/status/500')
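One caveat: when every retry against a forced status code fails, the exhausted attempts surface as an exception rather than as a response object. A minimal sketch of handling that case, using the same demonstration endpoint as above:

# After the retries are used up, Requests raises RetryError
try:
    response = session.get('https://httpbin.org/status/500')
except requests.exceptions.RetryError as e:
    print(f"Retries exhausted: {e}")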
File Upload and Download
1. Downloading Files
# Download a small file
response = requests.get('https://httpbin.org/bytes/1024')
with open('downloaded_file.bin', 'wb') as f:
    f.write(response.content)
# Stream a large file
response = requests.get('https://httpbin.org/stream-bytes/1024', stream=True)
with open('large_file.bin', 'wb') as f:
    for chunk in response.iter_content(chunk_size=8192):
        if chunk:  # filter out keep-alive chunks
            f.write(chunk)
# Download with a progress bar (requires the third-party tqdm package)
import tqdm
response = requests.get('https://httpbin.org/stream-bytes/1024', stream=True)
total_size = int(response.headers.get('content-length', 0))
with open('progress_file.bin', 'wb') as f, tqdm.tqdm(
    desc='Downloading',
    total=total_size,
    unit='iB',
    unit_scale=True,
    unit_divisor=1024,
) as bar:
    for data in response.iter_content(chunk_size=1024):
        size = f.write(data)
        bar.update(size)
2. Uploading Files
# Upload a single file (close the handle afterwards; see Resource Management below)
files = {'file': open('example.txt', 'rb')}
response = requests.post('https://httpbin.org/post', files=files)
# Upload multiple files
files = [
    ('file1', open('file1.txt', 'rb')),
    ('file2', open('file2.txt', 'rb'))
]
response = requests.post('https://httpbin.org/post', files=files)
# Upload a file together with extra form fields
files = {'file': open('example.txt', 'rb')}
data = {'description': 'This is a sample file'}
response = requests.post('https://httpbin.org/post', files=files, data=data)
# Stream a large upload with a generator (the body is sent with chunked transfer encoding)
def generate_file():
    with open('large_file.txt', 'rb') as f:
        while True:
            chunk = f.read(8192)
            if not chunk:
                break
            yield chunk
response = requests.post('https://httpbin.org/post',
                         data=generate_file(),
                         headers={'Content-Type': 'application/octet-stream'})
Advanced Usage
1. Custom Adapters
import ssl
from requests.adapters import HTTPAdapter

class TLSAdapter(HTTPAdapter):
    def init_poolmanager(self, *args, **kwargs):
        # WARNING: this disables certificate verification; use only for testing
        context = ssl.create_default_context()
        context.check_hostname = False
        context.verify_mode = ssl.CERT_NONE
        kwargs['ssl_context'] = context
        return super().init_poolmanager(*args, **kwargs)

session = requests.Session()
session.mount('https://', TLSAdapter())
2. Event Hooks
def print_url(response, *args, **kwargs):
    print(f"Request URL: {response.url}")

def print_response(response, *args, **kwargs):
    print(f"Response status code: {response.status_code}")

# Register the hooks (Requests only supports the 'response' hook event)
session = requests.Session()
session.hooks['response'].append(print_response)
session.hooks['response'].append(print_url)
response = session.get('https://httpbin.org/get')
3. Custom Authentication
from requests.auth import AuthBase

class TokenAuth(AuthBase):
    def __init__(self, token):
        self.token = token
    def __call__(self, r):
        r.headers['Authorization'] = f'Bearer {self.token}'
        return r

# Use the custom auth class
auth = TokenAuth('your_token_here')
response = requests.get('https://httpbin.org/headers', auth=auth)
Performance Optimization
1. Connection Pooling
from requests.adapters import HTTPAdapter
# Configure the connection pool
session = requests.Session()
adapter = HTTPAdapter(
    pool_connections=10,  # number of connection pools to cache (one per host)
    pool_maxsize=100,     # maximum number of connections kept per pool
    max_retries=3
)
session.mount('http://', adapter)
session.mount('https://', adapter)
2. Compression Support
# Requests decompresses gzip/deflate responses automatically
headers = {'Accept-Encoding': 'gzip, deflate'}
response = requests.get('https://httpbin.org/gzip', headers=headers)
# Check whether the response was compressed
print(response.headers.get('content-encoding'))
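The httpbin.org/gzip endpoint happens to return a JSON body containing a gzipped flag, so you can confirm that the body you read back was decompressed transparently (this flag is specific to httpbin, not a general HTTP feature):

# The body is already decompressed by the time you read it
print(response.json().get('gzipped'))  # True when the server gzipped the response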
3. Keep-Alive
# Session objects use Keep-Alive automatically
session = requests.Session()
# Multiple requests reuse the same underlying connection
for i in range(5):
    response = session.get('https://httpbin.org/get')
    print(f"Request {i+1} complete")
Practical Examples
1. Wrapping an API Client
class APIClient:
    def __init__(self, base_url, api_key=None):
        self.base_url = base_url.rstrip('/')
        self.session = requests.Session()
        if api_key:
            self.session.headers.update({'Authorization': f'Bearer {api_key}'})
        self.session.headers.update({
            'Content-Type': 'application/json',
            'User-Agent': 'MyAPIClient/1.0'
        })
    def request(self, method, endpoint, **kwargs):
        url = f"{self.base_url}/{endpoint.lstrip('/')}"
        response = self.session.request(method, url, **kwargs)
        response.raise_for_status()
        return response.json()
    def get(self, endpoint, params=None):
        return self.request('GET', endpoint, params=params)
    def post(self, endpoint, data=None):
        return self.request('POST', endpoint, json=data)
    def put(self, endpoint, data=None):
        return self.request('PUT', endpoint, json=data)
    def delete(self, endpoint):
        return self.request('DELETE', endpoint)

# Example usage
client = APIClient('https://api.example.com', api_key='your_key')
user_data = client.get('/users/1')
new_user = client.post('/users', {'name': 'John', 'email': 'john@example.com'})
2. A Web Scraper Example
import requests
from bs4 import BeautifulSoup
import time
import random

class WebScraper:
    def __init__(self):
        self.session = requests.Session()
        # Pick a random User-Agent
        user_agents = [
            'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36',
            'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36',
            'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36'
        ]
        self.session.headers.update({
            'User-Agent': random.choice(user_agents),
            'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
            'Accept-Language': 'en-US,en;q=0.5',
            'Accept-Encoding': 'gzip, deflate',
            'Connection': 'keep-alive',
        })
    def get_page(self, url, retries=3):
        for attempt in range(retries):
            try:
                response = self.session.get(url, timeout=10)
                response.raise_for_status()
                return response
            except requests.RequestException:
                if attempt == retries - 1:
                    raise
                # Back off for a random interval before retrying
                time.sleep(random.uniform(1, 3))
    def parse_page(self, url):
        response = self.get_page(url)
        soup = BeautifulSoup(response.text, 'html.parser')
        return soup
    def scrape_links(self, url):
        soup = self.parse_page(url)
        links = []
        for link in soup.find_all('a', href=True):
            href = link['href']
            if href.startswith('http'):
                links.append(href)
        return links

# Example usage
scraper = WebScraper()
links = scraper.scrape_links('https://example.com')
print(links)
Best Practices
1. Security
# SSL certificates are verified by default
response = requests.get('https://example.com', verify=True)
# Disable SSL verification (testing only)
response = requests.get('https://example.com', verify=False)
# Use a custom CA bundle
response = requests.get('https://example.com', verify='/path/to/certfile')
# Never hard-code secrets; read them from the environment instead
import os
api_key = os.getenv('API_KEY')
response = requests.get('https://api.example.com',
                        headers={'Authorization': f'Bearer {api_key}'})
2. Resource Management
# Use a context manager so the session is closed automatically
with requests.Session() as session:
    response = session.get('https://httpbin.org/get')
    print(response.json())
# Make sure files are closed properly
files = {'file': open('example.txt', 'rb')}
try:
    response = requests.post('https://httpbin.org/post', files=files)
finally:
    files['file'].close()
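The same upload can be written more compactly with a with block, which closes the file even if the request raises:

with open('example.txt', 'rb') as f:
    response = requests.post('https://httpbin.org/post', files={'file': f})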
3. Logging
import logging
# Enable verbose logging for Requests (urllib3 logs connection activity at DEBUG)
logging.basicConfig(level=logging.DEBUG)
# Or quiet the connection-pool logger down to warnings only
logging.getLogger("urllib3.connectionpool").setLevel(logging.WARNING)
# Custom logging around requests
import requests
import logging
logger = logging.getLogger(__name__)

def make_request(url, **kwargs):
    logger.info(f"Sending request to: {url}")
    try:
        response = requests.get(url, **kwargs)
        logger.info(f"Response status code: {response.status_code}")
        return response
    except Exception as e:
        logger.error(f"Request failed: {e}")
        raise
Common Problems and Solutions
1. Request Timeouts
# Set a reasonable timeout
try:
    response = requests.get('https://example.com', timeout=(3.05, 30))
except requests.Timeout:
    print("Request timed out; check your network connection or increase the timeout")
2. Memory Usage
# Stream large responses instead of loading them into memory at once
response = requests.get('https://example.com/large-file', stream=True)
with open('output.txt', 'wb') as f:
    for chunk in response.iter_content(chunk_size=8192):
        f.write(chunk)
3. Encoding Problems
# Requests falls back to ISO-8859-1 when no charset is declared; re-detect in that case
response = requests.get('https://example.com')
if response.encoding == 'ISO-8859-1':
    response.encoding = response.apparent_encoding
Summary
Requests is one of the best HTTP client libraries available for Python. It offers:
- A clean API: human-friendly interface design
- Rich functionality: support for all HTTP methods and features
- Good extensibility: a pluggable adapter architecture
- Thorough documentation: detailed official docs and strong community support
- Wide adoption: used throughout the Python ecosystem
Mastering Requests is an essential skill for network programming, API work, and web scraping. Used thoughtfully, its features let you build efficient, reliable networked applications.