# Scrape a serialized novel chapter by chapter from m.gulongsw.com and append it to novel.txt.
import random
import time

import requests
from requests.adapters import HTTPAdapter
from bs4 import BeautifulSoup


def UserAgent_random():
    """Return a headers dict with a randomly chosen User-Agent string."""
    user_agent_list = [
        'Mozilla/5.0 (Windows NT 6.2) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/28.0.1464.0 Safari/537.36',
        'Mozilla/5.0 (Windows NT 5.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/31.0.1650.16 Safari/537.36',
        'Mozilla/5.0 (Windows NT 5.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/35.0.3319.102 Safari/537.36',
        'Mozilla/5.0 (X11; CrOS i686 3912.101.0) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/27.0.1453.116 '
        'Safari/537.36',
        'Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/27.0.1453.93 Safari/537.36',
        'Mozilla/5.0 (Windows NT 6.2; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/32.0.1667.0 '
        'Safari/537.36',
        'Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:17.0) Gecko/20100101 Firefox/17.0.6',
        'Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/28.0.1468.0 Safari/537.36',
        'Mozilla/5.0 (Windows NT 5.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/41.0.2224.3 Safari/537.36',
        'Mozilla/5.0 (X11; CrOS i686 3912.101.0) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/27.0.1453.116 '
        'Safari/537.36',
    ]
    return {'User-Agent': random.choice(user_agent_list)}


def next_page(soup):
    """Return the href of the link labelled '下一章' (next chapter) in the pager, or None."""
    pager = soup.find(name='div', attrs={'class': 'pager'})
    for a in pager.findAll(name='a'):
        if a.string == '下一章':  # "next chapter"
            return str(a['href'])
    return None


def download_page(soup):
    """Append the chapter title and every paragraph of the current page to `paragraph`."""
    head = '【' + str(soup.h1.string) + '】' + '\n'  # chapter title
    paragraph.append(head)
    content_text = soup.find(name='div', attrs={'class': 'content'})
    for i in content_text.findAll(name='p'):
        paragraph.append(str(i.string) + '\n')
    paragraph.append('\n\n\n\n')


if __name__ == '__main__':
    url = 'https://m.gulongsw.com'
    url_r = '/xs_968/938982.html'
    # final_url = '/xs_968/1008623.html'
    # Stop when there is no next chapter or the pager links back to the index page.
    while url_r and url_r != '/xs_968/':
        paragraph = []
        UserAgent = UserAgent_random()
        s = requests.Session()
        # Retry transient network failures up to three times.
        s.mount('http://', HTTPAdapter(max_retries=3))
        s.mount('https://', HTTPAdapter(max_retries=3))
        try:
            real_html = s.get(url + url_r, headers=UserAgent, timeout=5).text
        except requests.exceptions.RequestException as e:
            print(e)
            break  # stop instead of parsing an undefined response
        soup = BeautifulSoup(real_html, 'html.parser')
        download_page(soup)
        url_r = next_page(soup)
        with open('novel.txt', 'a', encoding='utf-8') as f:
            for p in paragraph:
                f.write(p)
        time.sleep(1)  # brief pause between requests to be polite to the server