BaleeGraph - gallery view

ID: 1537

RCAST, Univ of Tokyo (in Japanese) Less

kaira

538 views / 1 year

References

Scraped by the following program:

import requests
from bs4 import BeautifulSoup
from urllib.parse import urljoin
import re

def sanitize_filename(filename):
    """Return *filename* with characters that are unsafe in file names cleaned up.

    The characters backslash, slash, colon, asterisk, question mark,
    double quote, less-than, greater-than and pipe are deleted. Each run
    of ASCII control characters (newline, tab, ...) is replaced by a
    single space so that multi-line labels still yield a one-line name.

    Args:
        filename: The proposed file name.

    Returns:
        The cleaned file name.
    """
    cleaned = re.sub(r'[\\/*?:"<>|]', "", filename)
    # Control characters are not allowed in Windows file names and make
    # names awkward to handle elsewhere, so collapse them to one space.
    return re.sub(r"[\x00-\x1f]+", " ", cleaned)

def _fetch_soup(url, timeout=30):
    """Download *url* and return it parsed as a BeautifulSoup document.

    Args:
        url: Absolute URL of the page to download.
        timeout: Seconds to wait for the server before giving up.

    Returns:
        The page parsed with the "html.parser" backend.

    Raises:
        requests.RequestException: On connection failure, timeout, or an
            HTTP error status (4xx/5xx).
    """
    response = requests.get(url, timeout=timeout)
    # Without this check an error page would be parsed and saved as if it
    # were real content.
    response.raise_for_status()
    # Let requests guess the encoding from the body instead of trusting
    # the HTTP header.
    response.encoding = response.apparent_encoding
    return BeautifulSoup(response.text, "html.parser")


def main():
    """Scrape every research-theme page linked from the index and save it.

    Downloads the research-category index page, collects the research
    links on it, then fetches each linked page and writes the text of its
    <h1 class="title01"> and first <div class="component"> elements to a
    UTF-8 text file in the current directory. Failures on individual
    pages are reported and skipped.
    """
    # URL of the research-theme index page.
    base_page_url = "https://www.rcast.u-tokyo.ac.jp/ja/research/researchcategory.html"
    try:
        soup = _fetch_soup(base_page_url)
    except requests.RequestException as e:
        print(f"Error fetching the main page: {e}")
        return

    # Collect every <a> tag whose href contains '/ja/research', starting
    # at the first link that mentions 'iwamoto_lab.html'; earlier links
    # are ignored.
    # NOTE(review): presumably 'iwamoto_lab.html' is the first lab entry
    # in the listing and everything before it is navigation; confirm.
    research_links = []
    is_start = False
    for a in soup.find_all("a", href=True):
        href = a.get("href")
        if 'iwamoto_lab.html' in href:
            is_start = True
        if is_start and '/ja/research' in href:
            research_links.append(a)

    print(f"見つかった研究テーマリンクの数: {len(research_links)}")

    # For each research link, fetch the target page, extract the wanted
    # text, and save it under a name built from the link label.
    # The counter advances for every link, including ones that later fail
    # to download, so file numbers match positions in the listing.
    for i_counter, a in enumerate(research_links, start=1):
        # Convert the (possibly relative) href to an absolute URL.
        link_url = urljoin(base_page_url, a.get("href"))

        # Build the output file name from the link label.
        link_text = a.get_text(separator="\n").strip()
        file_name = sanitize_filename(f"【研究テーマ {i_counter}】{link_text}.txt")
        print(f"【処理開始】\n ラベル: {link_text}\n URL: {link_url}\n 保存ファイル名: {file_name}")

        # Fetch the linked page.
        try:
            page_soup = _fetch_soup(link_url)
        except requests.RequestException as e:
            print(f"リンク先の取得に失敗: {link_url}\n エラー: {e}")
            continue

        # Extract only the parts of interest:
        #   1. the text of <h1 class="title01">
        #   2. the text inside the first <div class="component">
        text_parts = []
        for tag in (
            page_soup.find("h1", class_="title01"),
            page_soup.find("div", class_="component"),
        ):
            if tag:
                text_parts.append(tag.get_text(separator="\n").strip())
        content = "\n\n".join(text_parts)

        # Save as UTF-8 text.
        try:
            with open(file_name, "w", encoding="utf-8") as f:
                f.write(content)
        except OSError as e:
            print(f"ファイル保存に失敗: {file_name}\n エラー: {e}")
        else:
            print(f"保存完了: {file_name}\n")

# Run the scraper only when this file is executed as a script, not when
# it is imported as a module.
if __name__ == "__main__":
    main()

Memo

Less

Similar Graphs

RCAST, Univ of Tokyo (in Japanese) 616 views / 1 years	RCAST, Univ of Tokyo: All (in Japanese) Less 671 views / 1 years	Q-LEAP Papers (2022.4-2024.1, Exclude some words) 648 views / 2 years	Q-LEAP Papers (2022.4-2024.1) 590 views / 2 years	Q-LEAP Papers (2022.4-2023.10) 641 views / 2 years
[CSV mode sample] Wider Ver. of Automotive Industry Supply Chain (in Japanese) 682 views / 1 years	Dept of Systems Innovation, Univ of Tokyo 818 views / 2 years	SDGs and Dept of Systems Innovation, Univ of Tokyo 836 views / 2 years	NeurIPS 2023 Paper Awards 781 views / 2 years	Co-JUNKAN 研究開発課題 [Fix] 539 views / 1 years
(Modified) Basic Plan on Space Policy and Dept of Aero and Astro Curriculum, University of Tokyo (in Japanese) 769 views / 2 years	Co-JUNKAN 用語集 573 views / 1 years	C&C '23 Best Papers 711 views / 2 years	Dept of Aero and Astro Curriculum, Univ of Tokyo (in Japanese) 783 views / 2 years	Basic Plan on Space Policy and Dept of Aero and Astro Curriculum, University of Tokyo (in Japanese) 831 views / 2 years