import requests as rq
from bs4 import BeautifulSoup as bs
import sys
import pandas as pd
def gethtml(rooturl, encoding="utf-8", timeout=30):
    """Download a page and return its body as text.

    Args:
        rooturl: URL of the page to fetch.
        encoding: Encoding used to decode the response body (default utf-8).
        timeout: Seconds to wait for the server before requests gives up
            with a timeout error. Pass None to wait indefinitely.

    Returns:
        The decoded HTML text of the response.
    """
    # requests applies no timeout by default, so without one a stalled
    # server would block the script forever.
    response = rq.get(rooturl, timeout=timeout)
    # Override the detected encoding before reading .text.
    response.encoding = encoding
    return response.text
def getherf(html):
    """Extract the link target and text of every <a> tag in an HTML document.

    Args:
        html: HTML source text to parse.

    Returns:
        A tuple (hrefs, texts) of two equally long lists: the href
        attribute of each <a> tag (None when the tag has no href) and the
        text content of each <a> tag.
    """
    # Parse the incoming HTML with BeautifulSoup using the lxml parser.
    soup = bs(html, features="lxml")
    # Collect the anchors once and reuse them for both lists instead of
    # searching the tree a second time.
    anchors = soup.find_all("a")
    hrefs = [anchor.get("href") for anchor in anchors]
    texts = [anchor.text for anchor in anchors]
    return hrefs, texts
# Usage: python.exe crawler.py <target page URL>
# Crawls every <a> tag on the given page and writes the links to adm.xlsx.
if len(sys.argv) < 2:
    # Exit with a usage hint instead of a bare IndexError on sys.argv[1].
    sys.exit("usage: python crawler.py <target page URL>")

# The first command-line argument is the URL of the page to crawl.
html = gethtml(sys.argv[1])
# Parse the page once and unpack both returned lists.
result, result1 = getherf(html)
# Put the crawl results into two columns: hyperlink and displayed name.
data = {'超連結': result, '顯示名稱': result1}
# Sort the rows by the hyperlink column.
df = pd.DataFrame(data).sort_values(['超連結'], ascending=True)
# The context manager saves and closes the workbook even if to_excel raises.
with pd.ExcelWriter('adm.xlsx') as writer:
    df.to_excel(writer, sheet_name="http", index=False)
# NOTE: leftover blog-page footer text ("No comments:" / "Post a comment"); not part of the script.