Python 爬虫:伪装成浏览器用户访问网站,并抓取该网站的标题、关键字和简介的示例代码
import requests
from bs4 import BeautifulSoup
import chardet
from requests.packages.urllib3.exceptions import InsecureRequestWarning

# fetch_content() skips TLS certificate verification by default, which makes
# urllib3 emit an InsecureRequestWarning on every request; silence it once here.
requests.packages.urllib3.disable_warnings(InsecureRequestWarning)


def fetch_content(url, timeout=10, verify=False):
    """Download *url* while presenting a desktop-browser User-Agent.

    Args:
        url: Address of the page to fetch.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout a stalled server would block the script forever.
        verify: Whether to verify the server's TLS certificate. Defaults to
            False to keep the script's historical behaviour.
            NOTE(security): with verification off the connection can be
            intercepted; pass ``verify=True`` for anything beyond a demo.

    Returns:
        The decoded page text, or None if the request failed (the error is
        printed).
    """
    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/87.0.4280.88 Safari/537.36"
    }
    try:
        response = requests.get(
            url, headers=headers, timeout=timeout, verify=verify
        )
        # Sniff the encoding from the raw bytes because servers frequently
        # declare a wrong (or no) charset. Only override when detection
        # succeeds, so the server-declared encoding is not replaced by None.
        detected = chardet.detect(response.content)["encoding"]
        if detected:
            response.encoding = detected
        return response.text
    except requests.exceptions.RequestException as e:
        print(e)
        return None


def _meta_content(soup, name):
    """Return the ``content`` of ``<meta name=...>``, or '' when absent.

    The ``name`` attribute is compared case-insensitively so that variants
    such as ``name="Keywords"`` are found as well.
    """
    wanted = name.lower()
    tag = soup.find(
        "meta",
        attrs={
            "name": lambda value: value is not None
            and value.strip().lower() == wanted
        },
    )
    if tag is None:
        return ''
    return tag.get("content", '')


def parse_html(html):
    """Extract the title, keywords and description from an HTML document.

    Args:
        html: HTML source text.

    Returns:
        A ``(title, keywords, description)`` tuple of strings; each element
        is '' when the corresponding tag is missing.
    """
    soup = BeautifulSoup(html, "html.parser")
    # get_text() always yields a str, whereas .string is None when <title>
    # contains more than one child node.
    title = soup.title.get_text().strip() if soup.title else ''
    keywords = _meta_content(soup, "keywords")
    description = _meta_content(soup, "description")
    return title, keywords, description


def main(url="https://www.qq.com"):
    """Fetch *url* and print its title, keywords and description."""
    html = fetch_content(url)
    if not html:
        # fetch_content() has already printed the error, if there was one.
        return
    title, keywords, description = parse_html(html)
    print("标题:", title)
    print("关键字:", keywords)
    print("简介:", description)


if __name__ == "__main__":
    main()
版权声明:
1、本文系转载,版权归原作者所有,旨在传递信息,不代表本站的观点和立场。
2、本站仅提供信息发布平台,不承担相关法律责任。
3、若侵犯您的版权或隐私,请联系本站管理员删除。
4、文章来源:网络收集。