python标准库之互联网访问

发表于 2019-03-01 更新于 2021-11-23

Python 标准库中的 urllib 模块用来模拟发送 HTTP 请求。

一、简单请求

1、不传递参数（get请求）

import urllib.request

# Send a plain GET request (no `data` argument). `timeout` is in seconds.
# The `with` block closes the underlying connection once we are done with it.
with urllib.request.urlopen('https://www.baidu.com', timeout=1) as response:

    # URL of the resource that was retrieved.
    print("查看访问URL地址: \n", response.geturl())
    # Reason phrase and status code; `status` and `getcode()` are two ways
    # to read the same HTTP status.
    print("查看响应结果(成功返回ok)：\n", response.msg)
    print("查看响应状态码1(http status)：\n", response.status)
    print("查看响应状态码2(http status)：\n", response.getcode())

    print("查看 response 的返回类型：", type(response))

    # Response headers: `info()` gives a message object, `getheaders()` a
    # list of (name, value) pairs.
    print("查看头部信息1(http header)：\n", response.info())
    print("查看头部信息2(http header)：\n", response.getheaders())

    # Look up a single header by name.
    print("输出头部属性信息：", response.getheader("Server"))

    # The body is read as bytes and must be decoded before printing as text.
    page = response.read()

print("输出网页源码:",page.decode('utf-8'))

2、传递参数（post请求）

import urllib.request, urllib.parse

# urlopen() needs `data` as bytes: urlencode() turns the parameter mapping
# into a form-encoded string, which is then encoded as UTF-8.
params = {
    'name': 'MIka',
    'old': 18,
}
data = urllib.parse.urlencode(params).encode('utf-8')

# Supplying `data` makes urlopen() send a POST instead of a GET.
# The `with` block closes the connection after the body has been read.
with urllib.request.urlopen('http://httpbin.org/post', data=data) as response:
    print(response.read().decode('utf-8'))

二、完整请求

对某些网站的请求如果不加上 headers 等信息，就无法正常获取网页内容。所以需要先用 urllib.request.Request 类来包装请求（可以附带请求头、请求方法等），再通过 urlopen() 获取页面。

from urllib import request, parse

# Target URL
url = 'http://httpbin.org/post'
# Request headers
headers = {
    'User-Agent': 'Mozilla/5.0 (compatible; MSIE 5.5; Windows NT)',
    'Host': 'httpbin.org',
    # Optional. NOTE(review): urllib.request appears to send its own
    # "Connection: close" header, so this value may be overridden -- confirm.
    'Connection': 'keep-alive',
}
# Form fields for the POST body
form_fields = {
    'name': 'tom',
}
# The request body must be bytes, so encode the form-encoded string.
data = parse.urlencode(form_fields).encode('utf-8')

# Request bundles the URL, body, headers and HTTP method into one object.
req = request.Request(url=url, data=data, headers=headers, method='POST')

# The `with` block closes the connection after the body has been read.
with request.urlopen(req) as response:
    print(response.read().decode('utf-8'))

三、高级请求

1、密码验证

from urllib.request import HTTPPasswordMgrWithDefaultRealm,HTTPBasicAuthHandler,build_opener
from urllib.error import URLError

username = 'username'
password = 'password'
url = 'http://localhost'

# Password manager holding the credentials; passing None as the realm
# registers them under the manager's default realm for this URL.
password_mgr = HTTPPasswordMgrWithDefaultRealm()
password_mgr.add_password(None, url, username, password)
# Handler that answers HTTP Basic authentication using the manager above.
auth_handler = HTTPBasicAuthHandler(password_mgr)
# Opener that routes requests through the authentication handler.
opener = build_opener(auth_handler)

try:
    # Open the URL through the opener; the `with` block closes the
    # connection once the body has been read.
    with opener.open(url) as result:
        html = result.read().decode('utf-8')
except URLError as e:
    print(e.reason)
else:
    print(html)

2、代理

from urllib.error import URLError
from urllib.request import ProxyHandler,build_opener

# Map each URL scheme to the proxy server that should carry its requests.
proxy_handler = ProxyHandler({
    'http': 'http://127.0.0.1:8888',
    'https': 'http://127.0.0.1:9999',
})
# Opener that sends its requests through the proxy handler.
opener = build_opener(proxy_handler)
try:
    # The `with` block closes the connection once the body has been read.
    with opener.open('https://www.baidu.com') as response:
        body = response.read().decode('utf-8')
except URLError as e:
    print(e.reason)
else:
    print(body)

3、Cookies

保存cookies到本地文件

import http.cookiejar,urllib.request

filename = 'cookies.txt'

# A plain http.cookiejar.CookieJar() could be used instead when the cookies
# do not need to be written to disk. MozillaCookieJar is used here because it
# can save cookies to a file in the Mozilla cookies.txt format.
cookie = http.cookiejar.MozillaCookieJar(filename=filename)

# Handler that stores cookies from responses into the jar.
handler = urllib.request.HTTPCookieProcessor(cookie)
opener = urllib.request.build_opener(handler)
# The request is made only so the handler can collect cookies; the body is
# not read, and the `with` block closes the connection straight away.
with opener.open('http://www.baidu.com') as response:
    pass
# Write the jar to `filename`, including cookies marked to be discarded
# and cookies that have already expired.
cookie.save(ignore_discard=True, ignore_expires=True)

从文件中读取cookies

import http.cookiejar,urllib.request

# Load cookies from a Mozilla-format file, including cookies marked to be
# discarded and cookies that have already expired.
cookie = http.cookiejar.MozillaCookieJar()
cookie.load('cookies.txt', ignore_discard=True, ignore_expires=True)

# Handler that attaches the jar's cookies to outgoing requests.
handler = urllib.request.HTTPCookieProcessor(cookie)
opener = urllib.request.build_opener(handler)
# The `with` block closes the connection once the body has been read.
with opener.open('http://www.baidu.com') as response:
    print(response.read().decode('utf-8'))

注意：有 CookieJar、MozillaCookieJar、LWPCookieJar 三种 cookies 对象。其中 CookieJar 只在内存中保存 cookies，不支持读写文件；MozillaCookieJar 和 LWPCookieJar 可以保存到文件，但两者的文件格式不同，保存和加载时必须使用同一种。