# MediaWiki:MHDH—cpy.py

import os
import re

import requests
from bs4 import BeautifulSoup

# pip install requests
# pip install beautifulsoup4

def save_section_content(url, path, domain, file_name=''):
    """Fetch a wiki page and save its ``<section id="mw-content">`` to a file.

    The page is downloaded, a fixed set of elements (looked up by id) is
    removed, links inside the section that start with ``/`` are prefixed with
    *domain*, and the prettified section HTML is written to
    ``<path>/<name>.md``.

    Args:
        url: Address of the page to download.
        path: Directory the file is written to; created when missing.
        domain: Prefix handed to ``replace_href_domain`` for links that
            start with ``/``.
        file_name: Base name (without extension) of the output file. When
            empty, the page ``<title>`` is used after replacing characters
            that are not allowed in file names.

    Raises:
        requests.RequestException: If the download fails, times out, or the
            server answers with an error status.
        ValueError: If *file_name* is empty and the page has no usable title.
    """
    # Without a timeout, requests waits indefinitely on a silent server.
    response = requests.get(url, timeout=30)
    response.raise_for_status()

    soup = BeautifulSoup(response.text, 'html.parser')

    # Drop the listed elements so they do not end up in the saved copy.
    removed_ids = (
        'cosmos-header-interlang',
        'cosmos-articleHeader-actions',
        'printfooter',
        'cosmos-content-additionalContent',
        'CosmosRailWrapper',
    )
    for element_id in removed_ids:
        element = soup.find('div', {'id': element_id}) or soup.find('span', {'id': element_id})
        if element is not None:
            element.decompose()

    section = soup.find('section', {'id': 'mw-content'})
    if section is None:
        print('未找到指定的 <section id="mw-content"> 元素')
        return

    # Rewrite links only after the section is known to exist.
    section = replace_href_domain(section, domain)

    if file_name:
        base_name = file_name
    else:
        title_tag = soup.title
        title = title_tag.string if title_tag is not None else None
        if not title or not title.strip():
            raise ValueError(f'page has no <title>; pass file_name explicitly: {url}')
        # Replace characters that are invalid in file names on common
        # file systems in a single pass.
        invalid_chars = str.maketrans({
            '/': '-', '\\': '-', ':': '-', '*': '-', '?': '-', '|': '-',
            '"': "'", '<': '[', '>': ']',
        })
        base_name = title.strip().translate(invalid_chars)

    # open() does not create missing directories.
    if path:
        os.makedirs(path, exist_ok=True)
    save_path = os.path.join(path, f'{base_name}.md')

    with open(save_path, 'w', encoding='utf-8') as file:
        file.write(section.prettify())

    print(f'内容已保存到文件：{save_path}')

def replace_href_domain(content, domain):
    """Prefix root-relative links inside *content* with *domain*.

    Every ``<a>`` element that has an ``href`` starting with a single ``/``
    gets *domain* prepended, in place. Absolute URLs, fragments and
    protocol-relative links (``//host/...``) are left untouched.

    Args:
        content: Parsed HTML node exposing ``find_all`` (for example a
            BeautifulSoup tag), or ``None``.
        domain: Prefix to put in front of root-relative links. A trailing
            ``/`` is ignored so the result never contains ``//`` at the join.

    Returns:
        The same *content* object after modification, or ``None`` when
        *content* is ``None``.
    """
    if content is None:
        return None

    prefix = domain.rstrip('/')
    for anchor in content.find_all('a', href=True):
        href = anchor['href']
        # "//host/..." already names a host; prefixing it would corrupt it.
        if href.startswith('/') and not href.startswith('//'):
            anchor['href'] = f'{prefix}{href}'

    return content

def batch_remove_string_in_filename(folder_path, target_string):
    """Remove *target_string* from the names of all files under a folder.

    The folder is walked recursively. Each file whose name contains
    *target_string* is renamed to the same name with every occurrence
    removed; an existing file with the new name is overwritten. Directory
    names are not changed.

    Args:
        folder_path: Root directory to walk.
        target_string: Substring to delete from file names. An empty string
            makes the call a no-op.
    """
    if not target_string:
        return

    for root, _dirs, files in os.walk(folder_path):
        for file_name in files:
            if target_string not in file_name:
                continue
            new_file_name = file_name.replace(target_string, '')
            if not new_file_name:
                # Nothing would be left of the name; leave the file alone.
                continue
            # os.replace overwrites an existing target on every platform,
            # whereas os.rename fails on Windows when the target exists.
            os.replace(os.path.join(root, file_name), os.path.join(root, new_file_name))
    print(f'已清除“{target_string}”。')

# Run

path = './src/MHDH/'
domain = '//mhdh.pj568.eu.org'

# (page URL, output file name) pairs. An empty file name means the file is
# named after the page <title>.
pages = (
    # Hamud
    ('https://mhdh.pj568.eu.org/wiki/%E5%93%88%E5%A7%86', 'Hamud'),
    # Index
    ('https://mhdh.pj568.eu.org/wiki/MHDH%E4%B8%96%E7%95%8C%E7%BA%BF', 'index'),
    # Languages
    ('https://mhdh.pj568.eu.org/wiki/%E8%AF%AD%E8%A8%80%E5%88%97%E8%A1%A8', ''),
    # Graph
    ('https://mhdh.pj568.eu.org/wiki/%E6%89%80%E6%9C%89%E5%9B%BE%E4%BE%8B', ''),
    # Nations
    ('https://mhdh.pj568.eu.org/wiki/MHDH%E7%8E%B0%E5%AD%98%E5%9B%BD%E5%AE%B6%E5%88%97%E8%A1%A8', 'Nations'),
    # HITL
    ('https://mhdh.pj568.eu.org/wiki/HITL%E4%B8%96%E7%95%8C%E7%BA%BF', 'HITL'),
    # Hamud ethnic groups
    ('https://mhdh.pj568.eu.org/wiki/%E5%93%88%E5%A7%86%E6%B0%91%E6%97%8F%E5%88%97%E8%A1%A8', ''),
    # Hamud cultures
    ('https://mhdh.pj568.eu.org/wiki/%E5%93%88%E5%A7%86%E6%96%87%E5%8C%96%E5%88%97%E8%A1%A8', ''),
    # Hamud dialects
    ('https://mhdh.pj568.eu.org/wiki/%E5%93%88%E5%A7%86%E6%96%B9%E8%A8%80%E5%88%97%E8%A1%A8', ''),
)

for page_url, page_file_name in pages:
    save_section_content(page_url, path, domain, page_file_name)

# Strip ' - MHDH维基' from the saved file names (presumably the site-name
# suffix of the page titles that were used as file names).
batch_remove_string_in_filename(path, ' - MHDH维基')

# Template for adding another page (replace ##### with the page URL). Kept
# commented out because the placeholder is not a valid URL and the call
# would raise.
# save_section_content('#####', path, domain, '')