from requests.exceptions import Timeout
・・・
# Get the HTML content of the top page
try:
    response = requests.get(url, timeout=(6.0, 10.0))
    html = response.content.decode("utf-8")
except Timeout:
    print(f"\n[Timeout {url}]")
% bandit test.py
[main] INFO profile include tests: None
[main] INFO profile exclude tests: None
[main] INFO cli include tests: None
[main] INFO cli exclude tests: None
[main] INFO running on Python 3.8.3
[node_visitor] WARNING Unable to find qualified name for module: test.py
Run started:2023-08-14 08:04:48.814175
Test results:
No issues identified.
Code scanned:
Total lines of code: 53
Total lines skipped (#nosec): 0
Run metrics:
Total issues (by severity):
Undefined: 0
Low: 0
Medium: 0
High: 0
Total issues (by confidence):
Undefined: 0
Low: 0
Medium: 0
High: 0
Files skipped (0):
%
% bandit test.py
[main] INFO profile include tests: None
[main] INFO profile exclude tests: None
[main] INFO cli include tests: None
[main] INFO cli exclude tests: None
[main] INFO running on Python 3.8.3
[node_visitor] WARNING Unable to find qualified name for module: test.py
Run started:2023-08-14 01:37:45.050500
Test results:
>> Issue: [B113:request_without_timeout] Requests call without timeout
Severity: Medium Confidence: Low
CWE: CWE-400 (https://cwe.mitre.org/data/definitions/400.html)
More Info: https://bandit.readthedocs.io/en/1.7.5/plugins/b113_request_without_timeout.html
Location: test.py:31:11
30 # Get the HTML content of the top page
31 response = requests.get(url)
32 html = response.content.decode("utf-8")
--------------------------------------------------
>> Issue: [B113:request_without_timeout] Requests call without timeout
Severity: Medium Confidence: Low
CWE: CWE-400 (https://cwe.mitre.org/data/definitions/400.html)
More Info: https://bandit.readthedocs.io/en/1.7.5/plugins/b113_request_without_timeout.html
Location: test.py:62:15
61
62 response = requests.get(link)
63 html = response.content.decode("utf-8")
--------------------------------------------------
Code scanned:
Total lines of code: 46
Total lines skipped (#nosec): 0
Run metrics:
Total issues (by severity):
Undefined: 0
Low: 0
Medium: 2
High: 0
Total issues (by confidence):
Undefined: 0
Low: 2
Medium: 0
High: 0
Files skipped (0):
%
3. Discussion of the bandit results
It appears that the program's problems are listed under the "Test results:" section.
Two "Issue" entries are reported, and both are Requests call without timeout.
As the title says, instead of targeting a particular HTML tag, I extracted the HTML data by specifying the start and end of a string that is displayed in the browser.
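Before the full program, here is the core idea in isolation: split the raw HTML on the start string, then on the end string, and keep what sits in between. This is only a minimal sketch; sample_html and the two marker strings are made-up placeholders, not content from the real site.
# Standalone sketch of start/end string extraction (placeholder data)
sample_html = "<p>○○の一例:item one<br />item two</p><p>○○○した場合は、○○○までご連絡ください。</p>"
start_string = "○○の一例:"
end_string = "○○○した場合は、○○○までご連絡ください。"
# Take everything after start_string, cut at end_string, then cut at the closing </p>
target_text = sample_html.split(start_string)[1].split(end_string)[0].split("</p>")[0]
print(target_text)  # -> item one<br />item two
print([t for t in target_text.split('<br />') if t])  # -> ['item one', 'item two']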
1. Python program
# -*- coding: utf-8 -*-
from datetime import datetime, timedelta
import requests
import re
from bs4 import BeautifulSoup
# Set the day before yesterday's date
beforeyesterday = datetime.now() - timedelta(days=17)
beforeyesterday_str = beforeyesterday.strftime("%Y/%m/%d")
# URL of the top page
url = "https://www.xxx.jp/"・・・スクレイピングサイト"/"付き
domain ="https://www.xxx.jp"・・・スクレイピングサイト"/"無し
# start_string と end_string の間のストリングを取得する
start_string = "○○の一例:"
end_string = "○○○した場合は、○○○までご連絡ください。"
# Get the HTML content of the top page
response = requests.get(url)
html = response.content.decode("utf-8")
# Use BeautifulSoup to parse the HTML content
soup = BeautifulSoup(html, "html.parser")
# Find the parent element with the "○○○情報" title
security_info = soup.find('h1', string="○○○情報").parent
#print(security_info)
# Find all <h3> tags within the parent element
h3_tags = security_info.find_all('h3')
filtered_links = [tag.a['href'] for tag in h3_tags if tag.a is not None and beforeyesterday_str in tag.a.text]
for link in filtered_links:
    if link.startswith('http'):
        print(link)
    else:
        link = domain + link
        print(link)
    # Read the article for the specified date obtained from the top page
    response = requests.get(link)
    html = response.content.decode("utf-8")
    # Parse the HTML with BeautifulSoup
    # soup = BeautifulSoup(html, 'html.parser')
    # article = soup.find('article', class_="nc-content-list")
    # print(article)
    # texts_p = [c.get_text() for c in article.find_all('p')]
    # print(texts_p)
    # Split the target text on the start and end strings and take the middle part
    target_text = html.split(start_string)[1].split(end_string)[0].split("</p>")[0]
    print(target_text)
    # Replace newlines with an empty string to make a single text
    target_text = target_text.replace('\n', '')
    # Split on <br /> tags and store the non-empty pieces in a list, in order
    result_array = [text for text in target_text.split('<br />') if text]
    # Print the result
    print(result_array)
Line 33: filtered_links = [tag.a['href'] for tag in h3_tags if tag.a is not None and beforeyesterday_str in tag.a.text]. This line is quite long, but what it does is extract from h3_tags only the href links whose anchor text contains beforeyesterday_str, and store them in filtered_links.
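To see what that comprehension does in isolation, here is a small self-contained sketch; the HTML snippet and the date value are made-up placeholders.
from bs4 import BeautifulSoup

# Placeholder HTML imitating the <h3><a> structure the comprehension expects
sample_html = """
<h3><a href="/news/entry1.html">2023/08/12 Notice A</a></h3>
<h3><a href="/news/entry2.html">2023/08/01 Notice B</a></h3>
<h3>No link in this heading</h3>
"""
soup = BeautifulSoup(sample_html, "html.parser")
h3_tags = soup.find_all('h3')
beforeyesterday_str = "2023/08/12"
# Keep the href only when the <h3> has an <a> whose text contains the target date
filtered_links = [tag.a['href'] for tag in h3_tags
                  if tag.a is not None and beforeyesterday_str in tag.a.text]
print(filtered_links)  # -> ['/news/entry1.html']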
# -*- coding: utf-8 -*-
from datetime import datetime, timedelta
import requests
import re
from bs4 import BeautifulSoup
# Set the day before yesterday's date
beforeyesterday = datetime.now() - timedelta(days=15)
beforeyesterday_str = beforeyesterday.strftime("%Y%m%d")
mail_line_pattern = "From: \"[a-zA-Z0-9_.+-]+.+[a-zA-Z0-9-]+\.[a-zA-Z0-9-.]+\""
mail_pattern = "^[0-9a-zA-Z_.+-]+@[0-9a-zA-Z-]+\.[0-9a-zA-Z-.]+$"
env_mail_pattern = "<+[0-9a-zA-Z_.+-]+@[0-9a-zA-Z-]+\.[0-9a-zA-Z-.]+>"
subject_line_pattern = "Subject:"
# Initialize an empty list to store the articles
articles_beforeyesterday = []
text_beforeyesterday = []
link_beforeyesterday = []
mail_list = []
email_list = []
env_email_list = []
subject_list = []
title_list = []
# URL of the top page
url = "https://www.xxx.jp/news/"
domain ="http://www.xxx.jp"
# Get the HTML content of the top page
response = requests.get(url)
html = response.content.decode("utf-8")
# Use BeautifulSoup to parse the HTML content
soup = BeautifulSoup(html, "html.parser")
# Find all <a href> elements in the HTML
for a in soup.find_all("a", href=re.compile(beforeyesterday_str)):
    if a in articles_beforeyesterday:
        print("duplicated")
    else:
        text = a.getText()
        link = a.get("href")
        articles_beforeyesterday.append(a)
        text_beforeyesterday.append(text)
        link_beforeyesterday.append(link)
print(link_beforeyesterday)
for link in link_beforeyesterday:
    # Get the HTML content of the linked page
    if link.startswith('http'):
        print(link)
    else:
        link = domain + link
        print(link)
    response = requests.get(link)
    html = response.content.decode("utf-8")
    # Use BeautifulSoup to parse the HTML content
    soup = BeautifulSoup(html, "html.parser")
    h5_tags = soup.find_all('h5', class_='alert_h5')
    for tag in h5_tags:
        if tag.get_text() == '○○○の件名':
            print(tag.find_next('p').get_text())
# -*- coding: utf-8 -*-
from datetime import datetime, timedelta
import requests
import re
from bs4 import BeautifulSoup
# Set the day before yesterday's date
beforeyesterday = datetime.now() - timedelta(days=2)
beforeyesterday_str = beforeyesterday.strftime("%Y%m%d")
# mail_line_pattern = "From: \"[a-zA-Z0-9_.+-]+.+[a-zA-Z0-9-]+\.[a-zA-Z0-9-.]+\""
mail_line_pattern = "From:"
mail_pattern = "^[0-9a-zA-Z_.+-]+@[0-9a-zA-Z-]+\.[0-9a-zA-Z-.]+$"
env_mail_pattern = "<+[0-9a-zA-Z_.+-]+@[0-9a-zA-Z-]+\.[0-9a-zA-Z-.]+>"
subject_line_pattern = "Subject:"
# Initialize an empty list to store the articles
articles_beforeyesterday = []
text_beforeyesterday = []
link_beforeyesterday = []
mail_list = []
email_list = []
env_email_list = []
subject_list = []
title_list = []
# URL of the top page
url = "https://www.xxx.jp/news/"・・・スクレイピングするサイトのURL
# Get the HTML content of the top page
response = requests.get(url)
html = response.content.decode("utf-8")
# Use BeautifulSoup to parse the HTML content
soup = BeautifulSoup(html, "html.parser")
# Find all <a href> elements in the HTML
for a in soup.find_all("a", href=re.compile(beforeyesterday_str)):
    if a in articles_beforeyesterday:
        print("duplicated")
    else:
        text = a.getText()
        link = a.get("href")
        articles_beforeyesterday.append(a)
        text_beforeyesterday.append(text)
        link_beforeyesterday.append(link)
print(articles_beforeyesterday)
for link in link_beforeyesterday:
    # Get the HTML content of the linked page
    response = requests.get(link)
    html = response.content.decode("utf-8")
    # Use BeautifulSoup to parse the HTML content
    soup = BeautifulSoup(html, "html.parser")
    print(link)
    mail_list = soup.find_all(string=re.compile(mail_line_pattern))
    for mail in mail_list:
        # email = re.findall(mail_line_pattern, mail.replace('\n', ''))
        # The header email address may be written in Japanese, so changed to the following
        # lstrip() ... remove the leading characters (\n)
        # strip(mail_line_pattern) ... remove "From:"
        # split('<')[0] ... drop everything from "<" onward
        # replace('"', '') ... remove the double quotes
        email = mail.lstrip().strip(mail_line_pattern).split('<')[0].replace('"', '')
        env_email = re.findall(env_mail_pattern, mail.replace('\n', ''))
        email_list.append(email)
        env_email_list.append(env_email)
    print(email_list)
    print(env_email_list)
    subject_list = soup.find_all(string=re.compile(subject_line_pattern))
    for title in subject_list:
        email_title_line = title.replace('\n', '')
        email_title = email_title_line.replace('Subject: ', '')
        title_list.append(email_title)
    print(title_list)
    # Reset the lists before processing the next article
    mail_list = []
    email_list = []
    env_email_list = []
    subject_list = []
    title_list = []
2. Python code explanation
Picking up href links that contain a specified string
Line 38: for a in soup.find_all("a", href=re.compile(beforeyesterday_str)): Here, the re library is used to extract only the links whose href contains the date of the day before yesterday.
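As a standalone illustration of that filter, here is a minimal sketch; the HTML snippet and the date string are made-up placeholders.
import re
from bs4 import BeautifulSoup

# Placeholder HTML: one link contains the target date in its href, one does not
sample_html = """
<a href="/news/20230812_alert.html">Alert of the day</a>
<a href="/news/20230701_info.html">Older notice</a>
"""
soup = BeautifulSoup(sample_html, "html.parser")
beforeyesterday_str = "20230812"
# href=re.compile(...) makes find_all return only <a> tags whose href matches the pattern
for a in soup.find_all("a", href=re.compile(beforeyesterday_str)):
    print(a.get("href"))  # -> /news/20230812_alert.html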
from datetime import datetime, timedelta
import requests
from bs4 import BeautifulSoup
# Set yesterday's date
yesterday = datetime.now() - timedelta(days=1)
# Initialize an empty list to store the articles
articles_yesterday = []
# URL of the top page
url = "https://www.xxxx.jp/news/"
# Get the HTML content of the top page
response = requests.get(url)
html = response.text
# Use BeautifulSoup to parse the HTML content
soup = BeautifulSoup(html, "html.parser")
# Find all <li> elements in the HTML
for li in soup.find_all("li"):
    # Find the <time> element in the <li> element
    time_element = li.find("time")
    if time_element:
        # Extract the date from the "datetime" attribute of the <time> element
        date_str = time_element["datetime"].split("T")[0]
        date = datetime.strptime(date_str, "%Y-%m-%d")
        # Check if the date of the article is yesterday (compare dates only, not times)
        if date.date() == yesterday.date():
            # Find the <a> element in the <li> element
            a_element = li.find("a")
            if a_element:
                # Extract the title and the link of the article
                title = a_element.text
                link = a_element["href"]
                articles_yesterday.append((title, link))
# Print the articles of yesterday
for title, link in articles_yesterday:
    print(f"Title: {title}")
    print(f"Link: {link}\n")