# Crawler script crawler.py (collects every hyperlink found in a web page)

import requests as rq

from bs4 import BeautifulSoup as bs

import sys

import pandas as pd

def gethtml(rooturl, encoding="utf-8", timeout=10):
    """Download a page and return its body decoded as text.

    Args:
        rooturl: URL of the page to fetch.
        encoding: Codec applied to the response body before it is decoded.
            Defaults to UTF-8.
        timeout: Seconds to wait for the server before giving up. Without
            a timeout a stalled server would block the script forever.

    Returns:
        The response body as a string (the page's HTML).

    Raises:
        Whatever ``requests.get`` raises (connection errors, timeouts, ...);
        nothing is caught here.
    """
    response = rq.get(rooturl, timeout=timeout)
    # Force the caller's codec instead of relying on the one requests picked.
    response.encoding = encoding
    return response.text

def getherf(html):
    """Extract the link target and the visible text of every ``<a>`` tag.

    Args:
        html: HTML markup to parse (parsed with BeautifulSoup's lxml backend).

    Returns:
        A ``(hrefs, texts)`` pair of parallel lists, one entry per ``<a>``
        tag in document order. An entry in ``hrefs`` is ``None`` when the
        tag carries no ``href`` attribute.
    """
    soup = bs(html, features="lxml")

    # Search the tree once and derive both lists from the same tag sequence,
    # which also guarantees the two lists stay aligned.
    anchors = soup.find_all("a")

    hrefs = [anchor.get("href") for anchor in anchors]
    texts = [anchor.text for anchor in anchors]

    return hrefs, texts

# Usage: python.exe crawler.py <target page URL>
#
# Fetches the page named by the first command-line argument, collects every
# hyperlink together with its display text, and writes them to adm.xlsx.

# Fail with a readable message rather than a bare IndexError on sys.argv[1].
if len(sys.argv) < 2:
    sys.exit("usage: python crawler.py <url>")

html = gethtml(sys.argv[1])

# Parse the page a single time and unpack both returned lists at once.
result, result1 = getherf(html)

# Two-column table: link target and the text shown for it.
data = {'超連結': result, '顯示名稱': result1}

# Order the rows by the hyperlink column.
df = pd.DataFrame(data).sort_values(['超連結'], ascending=True)

# The context manager saves and closes the workbook even if writing fails.
with pd.ExcelWriter('adm.xlsx') as writer:
    df.to_excel(writer, sheet_name="http", index=False)

# (web page residue, not part of the script: "熱門文章" / "Popular articles")