import os
import random
import sqlite3

import requests
from bs4 import BeautifulSoup
class basicsplider:
    def __init__(self):
        # Create the SQLite database and table on first run.
        db_path = 'fang_data.sqlite'
        if not os.path.exists(db_path):
            conn = sqlite3.connect(db_path)
            c = conn.cursor()
            c.execute('''CREATE TABLE ershoufang (
                fang_id INTEGER PRIMARY KEY,
                fang_title CHAR(100),
                fang_flood CHAR(50),
                fang_layout CHAR(50),
                fang_area CHAR(50),
                fang_orientation CHAR(50),
                fang_built CHAR(50),
                fang_total CHAR(50),
                fang_unit CHAR(50),
                fang_followers CHAR(50),
                fang_publish CHAR(50)
            );''')
            conn.commit()
            conn.close()
            print('Database created successfully')
        self.conn = sqlite3.connect(db_path)
        self.c = self.conn.cursor()

    def close_connect(self):
        self.conn.close()
    def get_headers(self):
        agent_list = [
            'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.11 (KHTML, like Gecko) Chrome/20.0.1132.11 TaoBrowser/2.0 Safari/536.11',
            'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.1 (KHTML, like Gecko) Chrome/21.0.1180.71 Safari/537.1 LBBROWSER',
            'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/76.0.3809.100 Safari/537.36'
        ]
        user_agent = random.choice(agent_list)
        headers = {'User-Agent': user_agent, 'Accept-Language': 'zh-cn,zh;q=0.5'}
        return headers
    def get_proxies(self):
        # Pick a random proxy per request. The dict key must be 'http'
        # (the original 'http:' key was silently ignored by requests), and
        # the value needs a scheme. Note that requests only routes plain
        # http:// URLs through the 'http' entry; the https:// Lianjia pages
        # below would also need an 'https' entry to actually be proxied.
        http_proxies = [
            {'http': 'http://183.166.139.42:9999'},
            {'http': 'http://183.166.70.83:9999'},
            {'http': 'http://171.35.174.28:9999'}
        ]
        proxies = random.choice(http_proxies)
        return proxies
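    # Free proxies like the ones above go stale quickly. Below is a minimal
    # liveness-check sketch, assuming http://httpbin.org/ip as a test endpoint
    # (any stable URL that echoes the caller works); check_proxy is a name
    # chosen here for illustration, not part of the original code.
    def check_proxy(self, proxies, timeout=5):
        try:
            # A 200 response within the timeout suggests the proxy is usable.
            r = requests.get('http://httpbin.org/ip', proxies=proxies, timeout=timeout)
            return r.status_code == 200
        except requests.RequestException:
            return False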
    def get_data(self, url):
        headers = self.get_headers()
        proxies = self.get_proxies()
        try:
            response = requests.get(url, headers=headers, proxies=proxies, timeout=10)
            status_code = response.status_code
            # Pass the decoded text straight to BeautifulSoup; re-encoding it
            # to UTF-8 bytes (as the original did) only risks mojibake.
            soup = BeautifulSoup(response.text, 'lxml')
            return soup, status_code
        except Exception as e:
            print(str(e))
            return None, None
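    # Because free proxies fail often, a simple retry loop improves the hit
    # rate. A minimal sketch on top of get_data; max_retries is an
    # illustrative parameter, not part of the original design.
    def get_data_with_retry(self, url, max_retries=3):
        for _ in range(max_retries):
            # Each attempt draws a fresh random proxy and User-Agent.
            soup, status_code = self.get_data(url)
            if status_code == 200:
                return soup, status_code
        return None, None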
    def extraction(self, soup):
        # Each listing sits in a <div class="info clear"> block.
        fang_all = soup.find_all('div', class_="info clear")
        for fang in fang_all:
            try:
                fang_title = fang.find('div', class_="title").a.text.strip()
                fang_flood = fang.find('div', class_="flood").a.text.strip()
                # The address line packs several fields separated by '|':
                # layout, area, orientation, ..., build year at index 5.
                fang_address = fang.find('div', class_="address").div.text.strip()
                fang_layout = fang_address.split('|')[0].strip()
                fang_area = fang_address.split('|')[1].strip()
                fang_orientation = fang_address.split('|')[2].strip()
                fang_built = fang_address.split('|')[5].strip()
                # Keep the build-year field only if it really ends with '建'.
                if fang_built[-1:] != '建':
                    fang_built = ''
                fang_priceinfo = fang.find('div', class_="priceInfo")
                fang_total = fang_priceinfo.find('div', class_="totalPrice").text.strip()
                fang_unit = fang_priceinfo.find('div', class_="unitPrice").span.text.strip()
                fang_followinfo = fang.find('div', class_="followInfo").text
                fang_followers = fang_followinfo.split('/')[0].strip()
                fang_publish = fang_followinfo.split('/')[1].strip()
                self.save(fang_title, fang_flood, fang_layout, fang_area,
                          fang_orientation, fang_built, fang_unit, fang_total,
                          fang_followers, fang_publish)
            except Exception as e:
                print(str(e))
    def save(self, fang_title, fang_flood, fang_layout, fang_area,
             fang_orientation, fang_built, fang_unit, fang_total,
             fang_followers, fang_publish):
        # The original signature listed fang_followers/fang_publish before
        # fang_unit/fang_total, so the call in extraction() wrote values into
        # the wrong columns; the parameter order now matches the call site.
        # A parameterized query replaces the string-concatenated SQL, which
        # broke on quotes in titles and was open to injection.
        self.c.execute(
            "INSERT INTO ershoufang (fang_title, fang_flood, fang_layout, "
            "fang_area, fang_orientation, fang_built, fang_unit, fang_total, "
            "fang_followers, fang_publish) VALUES (?,?,?,?,?,?,?,?,?,?)",
            (fang_title, fang_flood, fang_layout, fang_area, fang_orientation,
             fang_built, fang_unit, fang_total, fang_followers, fang_publish)
        )
        self.conn.commit()
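    # A quick way to inspect what was stored, using the table created in
    # __init__; dump_rows is an illustrative helper added here, not part of
    # the original code.
    def dump_rows(self, limit=5):
        for row in self.c.execute(
                'SELECT fang_title, fang_total, fang_unit FROM ershoufang LIMIT ?',
                (limit,)):
            print(row)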
    def run(self, url):
        soup, status_code = self.get_data(url)
        if status_code == 200:
            self.extraction(soup)
if __name__ == '__main__':
    splider = basicsplider()
    # Crawl the first four result pages for Chaoyang district; close the
    # database connection only after the whole loop finishes, otherwise
    # later pages cannot be saved.
    for i in range(1, 5):
        url = 'https://bj.lianjia.com/ershoufang/chaoyang/pg' + str(i) + '/'
        splider.run(url)
    splider.close_connect()
    print('Scraping complete!')
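    # Note: a small random delay between page fetches reduces the chance of
    # being blocked. A minimal sketch, assuming an extra "import time" at the
    # top; the 1-3 second range is an illustrative choice, not a value from
    # the original code:
    #
    #     for i in range(1, 5):
    #         splider.run('https://bj.lianjia.com/ershoufang/chaoyang/pg%d/' % i)
    #         time.sleep(random.uniform(1, 3))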