As the title suggests, instead of targeting a particular HTML tag, I tried extracting HTML data by specifying the start and end of a string as it appears in the browser.
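Before the full program, here is a minimal sketch of the core idea, a split-based extraction between a start string and an end string. The sample HTML and the marker strings below are made up for illustration.
# Minimal sketch of split-based extraction (sample_html and the markers are hypothetical)
sample_html = '<p>○○の一例:foo<br />bar<br />baz</p><p>○○○した場合は、○○○までご連絡ください。</p>'
start_string = "○○の一例:"
end_string = "○○○した場合は、○○○までご連絡ください。"
# Take everything after start_string, then cut it off at end_string (or at the closing </p>)
middle = sample_html.split(start_string)[1].split(end_string)[0].split("</p>")[0]
print([t for t in middle.split('<br />') if t])  # ['foo', 'bar', 'baz']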
1. Python program
# -*- coding: utf-8 -*-
from datetime import datetime, timedelta
import requests
import re
from bs4 import BeautifulSoup
# Set the day before yesterday's date
beforeyesterday = datetime.now() - timedelta(days=17)
beforeyesterday_str = beforeyesterday.strftime("%Y/%m/%d")
# URL of the top page
url = "https://www.xxx.jp/"・・・スクレイピングサイト"/"付き
domain ="https://www.xxx.jp"・・・スクレイピングサイト"/"無し
# start_string と end_string の間のストリングを取得する
start_string = "○○の一例:"
end_string = "○○○した場合は、○○○までご連絡ください。"
# Get the HTML content of the top page
response = requests.get(url)
html = response.content.decode("utf-8")
# Use BeautifulSoup to parse the HTML content
soup = BeautifulSoup(html, "html.parser")
# Find the parent element with the "○○○情報" title
security_info = soup.find('h1', string="○○○情報").parent
#print(security_info)
# Find all <h3> tags within the parent element
h3_tags = security_info.find_all('h3')
filtered_links = [tag.a['href'] for tag in h3_tags if tag.a is not None and beforeyesterday_str in tag.a.text]
for link in filtered_links:
    if link.startswith('http'):
        print(link)
    else:
        link = domain + link
        print(link)
    # Load the article for the specified date picked up from the top page
    response = requests.get(link)
    html = response.content.decode("utf-8")
    # Parse the HTML with BeautifulSoup
    # soup = BeautifulSoup(html, 'html.parser')
    # article = soup.find('article', class_="nc-content-list")
    # print(article)
    # texts_p = [c.get_text() for c in article.find_all('p')]
    # print(texts_p)
    # Split the target text on the start and end strings and take the middle part
    target_text = html.split(start_string)[1].split(end_string)[0].split("</p>")[0]
    print(target_text)
    # Replace newline characters with empty strings to make a single text
    target_text = target_text.replace('\n', '')
    # Split on <br /> tags and store the pieces in a list, in order
    result_array = [text for text in target_text.split('<br />') if text]
    # Print the result
    print(result_array)
Line 33: filtered_links = [tag.a['href'] for tag in h3_tags if tag.a is not None and beforeyesterday_str in tag.a.text] — this line is rather long, but in the end it extracts from h3_tags only the href links whose link text contains beforeyesterday_str and stores them in filtered_links.
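As a rough standalone check of what that comprehension does, the following sketch runs it on a made-up HTML fragment (the links and the date string are invented for illustration):
# Standalone sketch of the filtering comprehension (sample HTML is hypothetical)
from bs4 import BeautifulSoup

sample = '''
<h3><a href="/news/2023/07/15_news.html">2023/07/15 Notice A</a></h3>
<h3><a href="/news/2023/07/16_news.html">2023/07/16 Notice B</a></h3>
<h3>No link here</h3>
'''
h3_tags = BeautifulSoup(sample, "html.parser").find_all('h3')
beforeyesterday_str = "2023/07/15"
# Keep only the href of <h3> tags containing an <a> whose link text includes the target date
filtered_links = [tag.a['href'] for tag in h3_tags if tag.a is not None and beforeyesterday_str in tag.a.text]
print(filtered_links)  # ['/news/2023/07/15_news.html']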
# -*- coding: utf-8 -*-
from datetime import datetime, timedelta
import requests
import re
from bs4 import BeautifulSoup
# Set the day before yesterday's date
beforeyesterday = datetime.now() - timedelta(days=15)
beforeyesterday_str = beforeyesterday.strftime("%Y%m%d")
mail_line_pattern = r"From: \"[a-zA-Z0-9_.+-]+.+[a-zA-Z0-9-]+\.[a-zA-Z0-9-.]+\""
mail_pattern = r"^[0-9a-zA-Z_.+-]+@[0-9a-zA-Z-]+\.[0-9a-zA-Z-.]+$"
env_mail_pattern = r"<+[0-9a-zA-Z_.+-]+@[0-9a-zA-Z-]+\.[0-9a-zA-Z-.]+>"
subject_line_pattern = "Subject:"
# Initialize an empty list to store the articles
articles_beforeyesterday = []
text_beforeyesterday = []
link_beforeyesterday = []
mail_list = []
email_list = []
env_email_list = []
subject_list = []
title_list = []
# URL of the top page
url = "https://www.xxx.jp/news/"
domain ="http://www.xxx.jp"
# Get the HTML content of the top page
response = requests.get(url)
html = response.content.decode("utf-8")
# Use BeautifulSoup to parse the HTML content
soup = BeautifulSoup(html, "html.parser")
# Find all <a href> elements in the HTML
for a in soup.find_all("a", href=re.compile(beforeyesterday_str)):
    if a in articles_beforeyesterday:
        print("duplicated")
    else:
        text = a.getText()
        link = a.get("href")
        articles_beforeyesterday.append(a)
        text_beforeyesterday.append(text)
        link_beforeyesterday.append(link)
print(link_beforeyesterday)
for link in link_beforeyesterday:
    # Build an absolute URL when the link is relative
    if link.startswith('http'):
        print(link)
    else:
        link = domain + link
        print(link)
    # Get the HTML content of the article page
    response = requests.get(link)
    html = response.content.decode("utf-8")
    # Use BeautifulSoup to parse the HTML content
    soup = BeautifulSoup(html, "html.parser")
    h5_tags = soup.find_all('h5', class_='alert_h5')
    for tag in h5_tags:
        if tag.get_text() == '○○○の件名':
            print(tag.find_next('p').get_text())
# -*- coding: utf-8 -*-
from datetime import datetime, timedelta
import requests
import re
from bs4 import BeautifulSoup
# Set the day before yesterday's date
beforeyesterday = datetime.now() - timedelta(days=2)
beforeyesterday_str = beforeyesterday.strftime("%Y%m%d")
# mail_line_pattern = "From: \"[a-zA-Z0-9_.+-]+.+[a-zA-Z0-9-]+\.[a-zA-Z0-9-.]+\""
mail_line_pattern = "From:"
mail_pattern = r"^[0-9a-zA-Z_.+-]+@[0-9a-zA-Z-]+\.[0-9a-zA-Z-.]+$"
env_mail_pattern = r"<+[0-9a-zA-Z_.+-]+@[0-9a-zA-Z-]+\.[0-9a-zA-Z-.]+>"
subject_line_pattern = "Subject:"
# Initialize an empty list to store the articles
articles_beforeyesterday = []
text_beforeyesterday = []
link_beforeyesterday = []
mail_list = []
email_list = []
env_email_list = []
subject_list = []
title_list = []
# URL of the top page
url = "https://www.xxx.jp/news/"・・・スクレイピングするサイトのURL
# Get the HTML content of the top page
response = requests.get(url)
html = response.content.decode("utf-8")
# Use BeautifulSoup to parse the HTML content
soup = BeautifulSoup(html, "html.parser")
# Find all <a href> elements in the HTML
for a in soup.find_all("a", href=re.compile(beforeyesterday_str)):
    if a in articles_beforeyesterday:
        print("duplicated")
    else:
        text = a.getText()
        link = a.get("href")
        articles_beforeyesterday.append(a)
        text_beforeyesterday.append(text)
        link_beforeyesterday.append(link)
print(articles_beforeyesterday)
for link in link_beforeyesterday:
    # Get the HTML content of the article page
    response = requests.get(link)
    html = response.content.decode("utf-8")
    # Use BeautifulSoup to parse the HTML content
    soup = BeautifulSoup(html, "html.parser")
    print(link)
    mail_list = soup.find_all(string=re.compile(mail_line_pattern))
    for mail in mail_list:
        # email = re.findall(mail_line_pattern, mail.replace('\n', ''))
        # The header mail address may also contain Japanese, so changed to the chain below
        # (a standalone sketch of this cleanup chain appears after this listing)
        # lstrip()                 ... remove the leading characters (\n)
        # strip(mail_line_pattern) ... remove the "From:" characters
        # split('<')[0]            ... drop everything from "<" onwards
        # replace('"', '')         ... remove double quotes
        email = mail.lstrip().strip(mail_line_pattern).split('<')[0].replace('"', '')
        env_email = re.findall(env_mail_pattern, mail.replace('\n', ''))
        email_list.append(email)
        env_email_list.append(env_email)
    print(email_list)
    print(env_email_list)
    subject_list = soup.find_all(string=re.compile(subject_line_pattern))
    for title in subject_list:
        email_title_line = title.replace('\n', '')
        email_title = email_title_line.replace('Subject: ', '')
        title_list.append(email_title)
    print(title_list)
    # Reset the lists for the next article
    mail_list = []
    email_list = []
    env_email_list = []
    subject_list = []
    title_list = []
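Here is the standalone sketch of the From: line cleanup chain referenced in the comments above, using a made-up header line. Note that strip(mail_line_pattern) removes the characters F, r, o, m and : from both ends rather than the literal prefix "From:", which is good enough here but worth keeping in mind.
# Standalone sketch of the From: line cleanup (the sample header line is hypothetical)
mail_line_pattern = "From:"
mail = '\nFrom: "Taro Yamada" <taro@example.com>'
email = mail.lstrip().strip(mail_line_pattern).split('<')[0].replace('"', '')
print(repr(email))  # ' Taro Yamada ' (surrounding spaces remain)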
2. Explanation of the Python code
Picking up href links that contain the specified string
Line 38: for a in soup.find_all("a", href=re.compile(beforeyesterday_str)): — here the re library is used to pick out the links whose href contains the date of the day before yesterday.
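As a small standalone illustration (the sample HTML below is made up), passing a compiled pattern as the href argument keeps only the <a> tags whose href attribute matches it:
# Standalone sketch of filtering <a> tags by href with a compiled pattern (sample HTML is hypothetical)
import re
from bs4 import BeautifulSoup

sample = '''
<li><a href="/news/20230715_alert.html">Alert of 2023-07-15</a></li>
<li><a href="/news/20230716_alert.html">Alert of 2023-07-16</a></li>
'''
soup = BeautifulSoup(sample, "html.parser")
beforeyesterday_str = "20230715"
for a in soup.find_all("a", href=re.compile(beforeyesterday_str)):
    print(a.get("href"))  # /news/20230715_alert.html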
from datetime import datetime, timedelta
import requests
from bs4 import BeautifulSoup
# Set yesterday's date
yesterday = datetime.now() - timedelta(days=1)
# Initialize an empty list to store the articles
articles_yesterday = []
# URL of the top page
url = "https://www.xxxx.jp/news/"
# Get the HTML content of the top page
response = requests.get(url)
html = response.text
# Use BeautifulSoup to parse the HTML content
soup = BeautifulSoup(html, "html.parser")
# Find all <li> elements in the HTML
for li in soup.find_all("li"):
    # Find the <time> element in the <li> element
    time_element = li.find("time")
    if time_element:
        # Extract the date from the "datetime" attribute of the <time> element
        date_str = time_element["datetime"].split("T")[0]
        date = datetime.strptime(date_str, "%Y-%m-%d")
        # Check if the date of the article is yesterday (compare dates, not datetimes)
        if date.date() == yesterday.date():
            # Find the <a> element in the <li> element
            a_element = li.find("a")
            if a_element:
                # Extract the title and the link of the article
                title = a_element.text
                link = a_element["href"]
                articles_yesterday.append((title, link))
# Print the articles of yesterday
for title, link in articles_yesterday:
    print(f"Title: {title}")
    print(f"Link: {link}\n")
from dateutil.parser import parse
from pycti import OpenCTIApiClient
from stix2 import TLP_GREEN
# OpenCTI API client initialization
opencti_api_client = OpenCTIApiClient("http://localhost:8080", "<API key of the locally hosted OpenCTI>")
# Define an OpenCTI compatible date
date = parse("2023-07-16").strftime("%Y-%m-%dT%H:%M:%SZ")
# Get the OpenCTI marking for stix2 TLP_GREEN
TLP_GREEN_CTI = opencti_api_client.marking_definition.read(id=TLP_GREEN["id"])
# Use the client to create an indicator in OpenCTI
indicator = opencti_api_client.indicator.create(
    name="C2 server of the new campaign",
    description="This is the C2 server of the campaign",
    pattern_type="stix",
    pattern="[IPv4-Addr:value = '100.172.180.181']",
    x_opencti_main_observable_type="IPv4-Addr",
    valid_from=date,
    update=True,
    markingDefinitions=[TLP_GREEN_CTI["id"]],
)
$ python3 create_indicator.py
INFO:pycti.entities:Listing Threat-Actors with filters null.
INFO:pycti.entities:Reading Marking-Definition {marking-definition--34098fce-860f-48ae-8e50-ebd3cc5e41da}.
INFO:pycti.entities:Creating Indicator {C2 server of the new campaign}.
from pycti import OpenCTIApiClient
# Variables
api_url = "http://localhost:8080"
api_token = "<API key of the locally hosted OpenCTI>"
# OpenCTI initialization
opencti_api_client = OpenCTIApiClient(api_url, api_token)
# Get all reports using the pagination
custom_attributes = """
id
pattern_type
pattern
created
description
"""
final_indicators = []
data = {"pagination": {"hasNextPage": True, "endCursor": None}}
while data["pagination"]["hasNextPage"]:
after = data["pagination"]["endCursor"]
if after:
print("Listing indicators after " + after)
data = opencti_api_client.indicator.list(
first=50,
after=after,
customAttributes=custom_attributes,
withPagination=True,
orderBy="created_at",
orderMode="asc",
)
final_indicators += data["entities"]
for indicator in final_indicators:
print("[" + indicator["created"] + "] " + indicator["pattern"] + ", " + indicator["description"])
And here is the execution result.
$ python3 get_all_indicators.py
INFO:pycti.entities:Listing Threat-Actors with filters null.
INFO:pycti.entities:Listing Indicators with filters null.
[2023-07-17T00:58:42.733Z] [domain-name:value = 'www.5z8.info'], This is the C2 server of the campaign
[2023-07-17T01:18:55.912Z] [IPv4-Addr:value = '100.172.180.180'], This is the C2 server of the campaign
[2023-07-17T01:34:48.208Z] [IPv4-Addr:value = '100.172.180.181'], This is the C2 server of the campaign