The urllib module in Python's standard library is used to send HTTP requests.
I. Simple requests
1. GET request (no parameters)
import urllib.request

# Open the URL; timeout is in seconds
response = urllib.request.urlopen('https://www.baidu.com', timeout=1)
print("Requested URL:\n", response.geturl())
print("Response message ('OK' on success):\n", response.msg)
print("HTTP status code (attribute):\n", response.status)
print("HTTP status code (method):\n", response.getcode())
print("Type of response:", type(response))
print("Response headers via info():\n", response.info())
print("Response headers via getheaders():\n", response.getheaders())
print("A single header field:", response.getheader("Server"))
page = response.read()
print("Page source:", page.decode('utf-8'))
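urlopen() has no argument for query parameters, so a GET with parameters is built by encoding them into the URL yourself. A minimal sketch, using the httpbin.org/get echo endpoint and made-up sample fields:

import urllib.parse, urllib.request

# Encode the query string and append it to the URL by hand
params = urllib.parse.urlencode({'name': 'Mika', 'old': 18})
url = 'http://httpbin.org/get?' + params
response = urllib.request.urlopen(url, timeout=5)
print(response.read().decode('utf-8'))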
2. POST request (with parameters)
import urllib.request, urllib.parse

# Form fields are urlencoded and passed as bytes; supplying data
# makes urlopen() issue a POST instead of a GET.
params = {'name': 'Mika', 'old': 18}
data = bytes(urllib.parse.urlencode(params), encoding='utf-8')
response = urllib.request.urlopen('http://httpbin.org/post', data=data)
print(response.read().decode('utf-8'))
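httpbin.org/post echoes the request back as JSON, which makes it easy to verify that the form data arrived. A minimal sketch of inspecting the echoed fields with the standard json module:

import json
import urllib.request, urllib.parse

data = bytes(urllib.parse.urlencode({'name': 'Mika'}), encoding='utf-8')
response = urllib.request.urlopen('http://httpbin.org/post', data=data)
body = json.loads(response.read().decode('utf-8'))
print(body['form'])   # the form fields httpbin received, e.g. {'name': 'Mika'}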
II. Complete requests
Some sites will not serve usable content unless the request carries extra information such as headers. In that case, wrap the request with the Request class first, then pass the Request object to urlopen() to fetch the page.
from urllib import request, parse

url = 'http://httpbin.org/post'
headers = {
    'User-Agent': 'Mozilla/5.0 (compatible; MSIE 5.5; Windows NT)',
    'Host': 'httpbin.org',
    'Connection': 'keep-alive',
}
params = {'name': 'tom'}
data = bytes(parse.urlencode(params), encoding='utf-8')

# Bundle URL, body, headers and method into one Request object
req = request.Request(url=url, data=data, headers=headers, method='POST')
response = request.urlopen(req)
print(response.read().decode('utf-8'))
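When such a request fails, urlopen() raises urllib.error.HTTPError (the server answered with a 4xx/5xx status) or urllib.error.URLError (a network-level failure). A minimal sketch of the usual handling pattern, assuming the same req object as above; HTTPError subclasses URLError, so it must be caught first:

from urllib import error, request

try:
    response = request.urlopen(req, timeout=5)
    print(response.read().decode('utf-8'))
except error.HTTPError as e:   # server replied with an error status
    print('HTTP error:', e.code, e.reason)
    print(e.headers)
except error.URLError as e:    # DNS failure, refused connection, timeout...
    print('URL error:', e.reason)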
III. Advanced requests
1. Password authentication
from urllib.request import HTTPPasswordMgrWithDefaultRealm, HTTPBasicAuthHandler, build_opener
from urllib.error import URLError

username = 'username'
password = 'password'
url = 'http://localhost'

# Register the credentials (realm=None means "any realm") and build
# an opener that answers HTTP basic-auth challenges automatically.
p = HTTPPasswordMgrWithDefaultRealm()
p.add_password(None, url, username, password)
auth_handler = HTTPBasicAuthHandler(p)
opener = build_opener(auth_handler)

try:
    result = opener.open(url)
    html = result.read().decode('utf-8')
    print(html)
except URLError as e:
    print(e.reason)
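Under the hood, HTTPBasicAuthHandler waits for the server's 401 challenge and then retries with an Authorization: Basic header containing base64-encoded "user:password". If you already know the server expects basic auth, you can set that header yourself and skip the extra round trip. A hedged sketch (URL and credentials are placeholders, not a working endpoint):

import base64
from urllib import request

username, password = 'username', 'password'
token = base64.b64encode(f'{username}:{password}'.encode('utf-8')).decode('ascii')
req = request.Request('http://localhost')
req.add_header('Authorization', f'Basic {token}')
# response = request.urlopen(req)   # credentials are sent on the first request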
2. Proxies
from urllib.error import URLError
from urllib.request import ProxyHandler, build_opener

# Route http:// and https:// traffic through local proxies
proxy_handler = ProxyHandler({
    'http': 'http://127.0.0.1:8888',
    'https': 'http://127.0.0.1:9999',
})
opener = build_opener(proxy_handler)

try:
    response = opener.open('https://www.baidu.com')
    print(response.read().decode('utf-8'))
except URLError as e:
    print(e.reason)
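An opener built this way only proxies requests made through opener.open(). If you want every later urlopen() call in the process to use the proxy as well, install the opener globally. A short sketch, reusing the proxy_handler defined above:

import urllib.request

opener = urllib.request.build_opener(proxy_handler)
urllib.request.install_opener(opener)   # module-level urlopen() now uses the proxy
# response = urllib.request.urlopen('https://www.baidu.com')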
3. Cookies
Saving cookies to a local file
import http.cookiejar, urllib.request

filename = 'cookies.txt'
# MozillaCookieJar stores cookies in the Mozilla/Netscape cookies.txt format
cookie = http.cookiejar.MozillaCookieJar(filename)
handler = urllib.request.HTTPCookieProcessor(cookie)
opener = urllib.request.build_opener(handler)
response = opener.open('http://www.baidu.com')
# Keep session cookies (ignore_discard) and expired ones (ignore_expires) too
cookie.save(ignore_discard=True, ignore_expires=True)
Loading cookies from a file
import http.cookiejar, urllib.request

cookie = http.cookiejar.MozillaCookieJar()
cookie.load('cookies.txt', ignore_discard=True, ignore_expires=True)
handler = urllib.request.HTTPCookieProcessor(cookie)
opener = urllib.request.build_opener(handler)
response = opener.open('http://www.baidu.com')
print(response.read().decode('utf-8'))
Note: http.cookiejar provides CookieJar plus the file-backed MozillaCookieJar and LWPCookieJar. The two file-backed jars write different on-disk formats, so always save and load with the same class.
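For example, switching both sides to LWPCookieJar is a one-line change each. A minimal sketch, with a made-up filename:

import http.cookiejar, urllib.request

# libwww-perl on-disk format instead of Mozilla's cookies.txt
cookie = http.cookiejar.LWPCookieJar('cookies_lwp.txt')
handler = urllib.request.HTTPCookieProcessor(cookie)
opener = urllib.request.build_opener(handler)
opener.open('http://www.baidu.com')
cookie.save(ignore_discard=True, ignore_expires=True)

# Reload later with the same class:
restored = http.cookiejar.LWPCookieJar()
restored.load('cookies_lwp.txt', ignore_discard=True, ignore_expires=True)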