"""Scrape a paginated video index and print the play links of every entry.

For each listing page the script follows every row to its detail page,
collects the metadata found there and pretty-prints the ``http://`` links
discovered in the play-list tables.
"""

import re
from pprint import pprint
from urllib.parse import urljoin

import requests
from bs4 import BeautifulSoup

# Base URL that the (relative) detail-page hrefs from the index are resolved against.
_DETAIL_BASE_URL = 'http://caiji.kuyun98.com/'

# Browser-like User-Agent sent with every request.
_REQUEST_HEADERS = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/86.0.4240.198 Safari/537.36'
}

# Seconds to wait for the server; without a timeout requests may block forever.
_REQUEST_TIMEOUT = 10

# Same pattern as before, now a raw string (no invalid "\s" escape) compiled once.
# NOTE(review): the trailing ".*" captures the rest of the line after the URL
# and only plain "http://" is matched -- confirm both are intended.
_LINK_PATTERN = re.compile(r'http://[^\s]*.*')


def _fetch_soup(url):
    """Download *url* and return its body parsed with the lxml parser."""
    response = requests.get(url, headers=_REQUEST_HEADERS, timeout=_REQUEST_TIMEOUT)
    return BeautifulSoup(response.text, 'lxml')


def _cell_text(row, position):
    """Return the stripped text of the *position*-th (1-based) ``<td>`` of *row*."""
    selector = 'tr td:nth-of-type({})'.format(position)
    return row.select(selector)[0].get_text().strip()


def _parse_detail(soup):
    """Extract metadata and play links from a parsed detail page.

    Returns a dict with the thumbnail URL, the text of four fixed rows of the
    info table, and ``link_groups``: one list of regex matches per ``<a>``
    found in the play-list tables.

    Raises:
        IndexError: if the page does not have the expected table layout.
    """
    main_table = soup.select('table')[1]
    info_rows = main_table.find('table').select('tr')

    link_groups = []
    # The first nested table is skipped, as in the original slice.
    for play_table in main_table.select('table tr table')[1:]:
        for anchor in play_table.select('a'):
            # get_text(' ') joins every text node with a space; joining the
            # tag's children directly raised TypeError on nested markup.
            link_groups.append(_LINK_PATTERN.findall(anchor.get_text(' ')))

    return {
        'thumbnail': main_table.select('img')[0].get('src'),  # thumbnail image
        'actors': info_rows[1].get_text(),        # actors
        'director': info_rows[2].get_text(),      # film director
        'release_time': info_rows[8].get_text(),  # release time
        'description': info_rows[9].get_text(),   # film synopsis
        'link_groups': link_groups,
    }


def joke(page):
    """Scrape index page *page* and print the play links of each listed entry.

    Every ``<tr class="row">`` of the second table on the index page is
    followed to its detail page. For each ``<a>`` in that page's play-list
    tables the list of matched links is pretty-printed (an empty list is
    printed when nothing matches).

    Args:
        page: Page number inserted into the index URL.

    Returns:
        A list with one dict per entry, combining the index-row fields
        (``title``, ``url``, ``region``, ``category``, ``added``) with the
        detail-page fields from ``_parse_detail``. Earlier versions returned
        ``None``; callers that ignore the result are unaffected.

    Raises:
        requests.RequestException: on network failure or timeout.
        IndexError, AttributeError: if a page lacks the expected markup.
    """
    # NOTE(review): host "xx" looks like a placeholder -- confirm the real index host.
    index_url = 'http://xx/?m=vod-index-pg-' + str(page) + '.html'
    listing = _fetch_soup(index_url).find_all("table")[1]

    records = []
    for row in listing.find_all("tr", {"class": "row"}):
        record = {
            'title': row.a.get_text().strip(),
            # urljoin copes with relative, root-relative and absolute hrefs.
            'url': urljoin(_DETAIL_BASE_URL, row.a['href']),
            # presumably region / category (original names: diqu / lleibie) -- TODO confirm
            'region': _cell_text(row, 2),
            'category': _cell_text(row, 3),
            'added': _cell_text(row, 4),
        }
        record.update(_parse_detail(_fetch_soup(record['url'])))

        for links in record['link_groups']:
            pprint(links)
        records.append(record)

    return records


if __name__ == "__main__":
    for page in range(1, 2):
        joke(page)