源码制作网站免费网址注册
源码制作网站,免费网址注册,校园网站建设的开题报告,自己怎么建设收费电影网站引言#xff1a;国际组织公开数据的价值与挑战在全球化日益深入的今天#xff0c;联合国、世界银行、世界卫生组织等国际机构的公开数据已成为研究全球经济、社会发展、公共卫生和环境变化的重要资源。这些数据不仅为政策制定者提供决策依据#xff0c;也为研究人员、企业和…引言国际组织公开数据的价值与挑战在全球化日益深入的今天联合国、世界银行、世界卫生组织等国际机构的公开数据已成为研究全球经济、社会发展、公共卫生和环境变化的重要资源。这些数据不仅为政策制定者提供决策依据也为研究人员、企业和普通公众提供了深入了解世界发展趋势的窗口。然而从这些国际组织网站高效、规范地采集数据面临多重挑战网站结构复杂、数据格式多样JSON、XML、CSV、PDF、反爬虫机制严格以及数据量庞大导致的采集效率问题。本文将详细介绍如何运用Python最新爬虫技术构建一个高效、稳定、可维护的国际组织数据采集系统。技术选型现代Python爬虫技术栈1. 异步编程aiohttp与asyncio传统同步爬虫在I/O等待时浪费大量时间而异步爬虫可以同时处理多个请求显著提高数据采集效率。2. 请求管理httpx与HTTP/2支持httpx是新一代HTTP客户端支持同步和异步请求内置HTTP/2支持能够更高效地与服务器通信。3. 解析工具BeautifulSoup4与parsel对于HTML解析BeautifulSoup4提供友好的API而parselScrapy使用的选择器库在性能上更优支持CSS和XPath选择器。4. 动态页面处理Playwright现代网站大量使用JavaScript动态加载数据Playwright可以模拟真实浏览器环境处理SPA单页应用等复杂场景。5. 数据存储多种格式支持根据数据特点选择存储方式JSON、CSQLite、Parquet或直接存入数据库。6. 代理与限速智能请求策略尊重网站服务器实现智能限速、代理轮换和用户代理伪装。完整爬虫系统架构设计python 国际组织数据采集系统 作者数据科学探索者 创建日期2024年 描述高效采集联合国等国际组织公开数据的完整爬虫系统 import asyncio import aiohttp import httpx import json import pandas as pd import sqlite3 from datetime import datetime, timedelta from typing import Dict, List, Optional, Any from dataclasses import dataclass, asdict from enum import Enum import logging from pathlib import Path import hashlib import time from urllib.parse import urljoin, urlparse, parse_qs, urlencode import csv from bs4 import BeautifulSoup import re from playwright.async_api import async_playwright import nest_asyncio from tenacity import retry, stop_after_attempt, wait_exponential, retry_if_exception_type import xml.etree.ElementTree as ET from dataclasses_json import dataclass_json import yaml # 应用nest_asyncio允许在Jupyter等环境中运行嵌套事件循环 nest_asyncio.apply() # 配置日志系统 logging.basicConfig( levellogging.INFO, format%(asctime)s - %(name)s - %(levelname)s - %(message)s, handlers[ logging.FileHandler(un_data_crawler.log, encodingutf-8), logging.StreamHandler() ] ) logger logging.getLogger(__name__) class InternationalOrg(Enum): 国际组织枚举 UN united_nations WORLD_BANK world_bank WHO 
world_health_organization IMF international_monetary_fund UNESCO unesco FAO fao dataclass_json dataclass class DatasetMetadata: 数据集元数据 org: str dataset_id: str title: str description: str source_url: str last_updated: str format: str keywords: List[str] license: str language: str coverage_period: Optional[str] None geographic_coverage: Optional[str] None record_count: Optional[int] None file_size: Optional[str] None md5_hash: Optional[str] None def generate_id(self) - str: 生成唯一标识符 content f{self.org}_{self.dataset_id}_{self.last_updated} return hashlib.md5(content.encode()).hexdigest() class RateLimiter: 智能速率限制器 def __init__(self, requests_per_minute: int 30): self.requests_per_minute requests_per_minute self.min_interval 60.0 / requests_per_minute self.last_request_time 0 self.semaphore asyncio.Semaphore(requests_per_minute) async def wait_if_needed(self): 如果需要则等待 elapsed time.time() - self.last_request_time if elapsed self.min_interval: await asyncio.sleep(self.min_interval - elapsed) self.last_request_time time.time() async def acquire(self): 获取请求许可 await self.semaphore.acquire() await self.wait_if_needed() def release(self): 释放请求许可 self.semaphore.release() class ProxyManager: 代理管理器 def __init__(self, proxy_list: Optional[List[str]] None): self.proxy_list proxy_list or [] self.current_index 0 self.proxy_failures {} def get_next_proxy(self) - Optional[Dict[str, str]]: 获取下一个可用代理 if not self.proxy_list: return None if self.current_index len(self.proxy_list): self.current_index 0 proxy self.proxy_list[self.current_index] self.current_index 1 # 检查代理是否最近失败过 if proxy in self.proxy_failures: last_failure self.proxy_failures[proxy] if datetime.now() - last_failure timedelta(minutes5): return self.get_next_proxy() return {http: proxy, https: proxy} def mark_proxy_failed(self, proxy: str): 标记代理失败 self.proxy_failures[proxy] datetime.now() def mark_proxy_success(self, proxy: str): 标记代理成功 if proxy in self.proxy_failures: del self.proxy_failures[proxy] class UNDataCrawler: 
联合国数据爬虫主类 def __init__(self, base_dir: str ./un_data): self.base_dir Path(base_dir) self.base_dir.mkdir(parentsTrue, exist_okTrue) # 初始化子目录 self.raw_data_dir self.base_dir / raw_data self.processed_data_dir self.base_dir / processed_data self.metadata_dir self.base_dir / metadata self.logs_dir self.base_dir / logs for directory in [self.raw_data_dir, self.processed_data_dir, self.metadata_dir, self.logs_dir]: directory.mkdir(exist_okTrue) # 初始化组件 self.rate_limiter RateLimiter(requests_per_minute20) self.proxy_manager ProxyManager() # 用户代理列表 self.user_agents [ Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36, Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/119.0.0.0 Safari/537.36, Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/118.0.0.0 Safari/537.36 ] # 会话管理 self.session None self.client None # 配置数据库 self.db_path self.base_dir / un_data.db self.init_database() def init_database(self): 初始化SQLite数据库 conn sqlite3.connect(self.db_path) cursor conn.cursor() # 创建数据集元数据表 cursor.execute( CREATE TABLE IF NOT EXISTS datasets ( id TEXT PRIMARY KEY, org TEXT, dataset_id TEXT, title TEXT, description TEXT, source_url TEXT, last_updated TEXT, format TEXT, keywords TEXT, license TEXT, language TEXT, coverage_period TEXT, geographic_coverage TEXT, record_count INTEGER, file_size TEXT, md5_hash TEXT, downloaded_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP, download_status TEXT, file_path TEXT ) ) # 创建下载日志表 cursor.execute( CREATE TABLE IF NOT EXISTS download_logs ( id INTEGER PRIMARY KEY AUTOINCREMENT, dataset_id TEXT, url TEXT, status_code INTEGER, download_time TIMESTAMP DEFAULT CURRENT_TIMESTAMP, file_size INTEGER, duration REAL, error_message TEXT ) ) # 创建索引以提高查询性能 cursor.execute(CREATE INDEX IF NOT EXISTS idx_org ON datasets (org)) cursor.execute(CREATE INDEX IF NOT EXISTS idx_last_updated ON datasets (last_updated)) cursor.execute(CREATE INDEX 
IF NOT EXISTS idx_download_status ON datasets (download_status)) conn.commit() conn.close() async def init_session(self): 初始化HTTP会话 # 异步会话用于aiohttp connector aiohttp.TCPConnector( limit20, # 连接池大小 limit_per_host5, # 每个主机连接数限制 ttl_dns_cache300, # DNS缓存时间 force_closeTrue # 强制关闭空闲连接 ) self.session aiohttp.ClientSession( connectorconnector, headers{ User-Agent: self.get_random_user_agent(), Accept: application/json, text/html, application/xml;q0.9,*/*;q0.8, Accept-Language: en-US,en;q0.9, Accept-Encoding: gzip, deflate, br, Connection: keep-alive, Upgrade-Insecure-Requests: 1 }, timeoutaiohttp.ClientTimeout(total30) ) # 同步/异步客户端用于httpx self.client httpx.AsyncClient( http2True, # 启用HTTP/2 timeout30.0, follow_redirectsTrue, limitshttpx.Limits(max_keepalive_connections5, max_connections10) ) def get_random_user_agent(self) - str: 获取随机用户代理 import random return random.choice(self.user_agents) retry( stopstop_after_attempt(3), waitwait_exponential(multiplier1, min2, max10), retryretry_if_exception_type((aiohttp.ClientError, asyncio.TimeoutError)) ) async def fetch_page(self, url: str, use_playwright: bool False, **kwargs) - Optional[str]: 获取网页内容 Args: url: 目标URL use_playwright: 是否使用Playwright处理动态页面 **kwargs: 额外参数 Returns: 网页内容或None await self.rate_limiter.acquire() try: headers kwargs.get(headers, {}) headers[User-Agent] self.get_random_user_agent() if use_playwright: return await self.fetch_with_playwright(url) else: proxy self.proxy_manager.get_next_proxy() async with self.session.get(url, headersheaders, proxyproxy) as response: response.raise_for_status() # 检查内容类型 content_type response.headers.get(Content-Type, ) if json in content_type: return await response.json() elif xml in content_type: return await response.text() else: return await response.text(encodingutf-8, errorsignore) except aiohttp.ClientError as e: logger.error(f请求失败 {url}: {str(e)}) # 标记代理失败 proxy_url kwargs.get(proxy) if proxy_url: self.proxy_manager.mark_proxy_failed(proxy_url) raise except Exception as 
e: logger.error(f未知错误 {url}: {str(e)}) raise finally: self.rate_limiter.release() async def fetch_with_playwright(self, url: str) - str: 使用Playwright获取动态页面内容 async with async_playwright() as p: # 启动浏览器 browser await p.chromium.launch( headlessTrue, args[--disable-blink-featuresAutomationControlled] ) # 创建上下文 context await browser.new_context( user_agentself.get_random_user_agent(), viewport{width: 1920, height: 1080} ) # 创建页面 page await context.new_page() try: # 导航到URL await page.goto(url, wait_untilnetworkidle) # 等待可能的内容加载 await page.wait_for_timeout(2000) # 获取页面内容 content await page.content() return content finally: await browser.close() async def discover_un_datasets(self) - List[DatasetMetadata]: 发现联合国数据集的元数据 Returns: 数据集元数据列表 logger.info(开始发现联合国数据集...) datasets [] # UN Data API端点 un_data_api https://data.un.org/api/ # 联合国数据目录 catalog_urls [ https://data.un.org/Host.aspx?ContentData, https://unstats.un.org/sdgs/indicators/database/, https://data.un.org/Explorer.aspx ] for url in catalog_urls: try: logger.info(f正在爬取数据目录: {url}) html await self.fetch_page(url, use_playwrightTrue) if html: soup BeautifulSoup(html, html.parser) # 查找数据集链接示例选择器实际需要调整 dataset_links soup.find_all(a, hrefre.compile(rDataSet|Dataset|data, re.I)) for link in dataset_links[:10]: # 限制数量用于演示 dataset_url urljoin(url, link.get(href)) # 提取元数据 metadata await self.extract_dataset_metadata(dataset_url) if metadata: datasets.append(metadata) except Exception as e: logger.error(f处理URL {url} 时出错: {str(e)}) continue logger.info(f共发现 {len(datasets)} 个数据集) return datasets async def extract_dataset_metadata(self, url: str) - Optional[DatasetMetadata]: 从数据页面提取元数据 try: html await self.fetch_page(url) if not html: return None soup BeautifulSoup(html, html.parser) # 提取标题 title_elem soup.find(h1) or soup.find(title) title title_elem.get_text(stripTrue) if title_elem else Untitled Dataset # 提取描述 description meta_desc soup.find(meta, attrs{name: description}) if meta_desc: description meta_desc.get(content, ) 
else: # 尝试查找第一个段落 first_p soup.find(p) if first_p: description first_p.get_text(stripTrue)[:500] # 提取其他元数据 keywords [] keywords_meta soup.find(meta, attrs{name: keywords}) if keywords_meta: keywords [k.strip() for k in keywords_meta.get(content, ).split(,)] # 创建元数据对象 metadata DatasetMetadata( orgInternationalOrg.UN.value, dataset_idself.generate_dataset_id(url), titletitle, descriptiondescription, source_urlurl, last_updateddatetime.now().isoformat(), formatHTML, # 默认格式 keywordskeywords, licenseUN Open Data License, languageen ) return metadata except Exception as e: logger.error(f提取元数据失败 {url}: {str(e)}) return None def generate_dataset_id(self, url: str) - str: 生成数据集ID parsed urlparse(url) path_parts [p for p in parsed.path.split(/) if p] if path_parts: return path_parts[-1].replace(.aspx, ).replace(.html, ) else: return hashlib.md5(url.encode()).hexdigest()[:10] async def download_dataset(self, metadata: DatasetMetadata, output_format: str json) - bool: 下载数据集 Args: metadata: 数据集元数据 output_format: 输出格式 Returns: 下载是否成功 start_time time.time() try: logger.info(f开始下载数据集: {metadata.title}) # 根据URL类型选择下载方法 if metadata.source_url.endswith(.json): data await self.download_json(metadata.source_url) file_ext json elif metadata.source_url.endswith(.xml): data await self.download_xml(metadata.source_url) file_ext xml elif metadata.source_url.endswith(.csv): data await self.download_csv(metadata.source_url) file_ext csv else: # 尝试检测API端点 data await self.detect_and_download_api_data(metadata.source_url) file_ext json if not data: logger.warning(f未找到数据: {metadata.source_url}) return False # 保存数据 file_path await self.save_dataset(metadata, data, file_ext) # 更新元数据 metadata.file_size self.get_file_size(file_path) metadata.md5_hash self.calculate_md5(file_path) # 保存元数据到数据库 await self.save_metadata_to_db(metadata, file_path, success) duration time.time() - start_time logger.info(f数据集下载完成: {metadata.title} ({duration:.2f}秒)) return True except Exception as e: logger.error(f下载数据集失败 
{metadata.title}: {str(e)}) # 记录失败状态 await self.save_metadata_to_db(metadata, None, ffailed: {str(e)}) return False async def download_json(self, url: str) - Optional[Dict]: 下载JSON数据 try: async with self.session.get(url) as response: return await response.json() except Exception as e: logger.error(f下载JSON失败 {url}: {str(e)}) return None async def download_xml(self, url: str) - Optional[ET.Element]: 下载XML数据 try: async with self.session.get(url) as response: xml_text await response.text() return ET.fromstring(xml_text) except Exception as e: logger.error(f下载XML失败 {url}: {str(e)}) return None async def download_csv(self, url: str) - Optional[List[Dict]]: 下载CSV数据 try: async with self.session.get(url) as response: csv_text await response.text() # 解析CSV csv_reader csv.DictReader(csv_text.splitlines()) return list(csv_reader) except Exception as e: logger.error(f下载CSV失败 {url}: {str(e)}) return None async def detect_and_download_api_data(self, url: str) - Optional[Any]: 检测并下载API数据 # 尝试常见的数据API模式 api_patterns [ /api/, /data/, /export/, /query, formatjson ] for pattern in api_patterns: if pattern in url.lower(): # 尝试获取JSON数据 try: json_url url if formatjson in url else f{url}?formatjson return await self.download_json(json_url) except: continue # 如果不是API返回页面内容 return await self.fetch_page(url) async def save_dataset(self, metadata: DatasetMetadata, data: Any, file_ext: str) - Path: 保存数据集到文件 # 生成安全文件名 safe_title re.sub(r[^\w\-_], _, metadata.title)[:100] timestamp datetime.now().strftime(%Y%m%d_%H%M%S) filename f{metadata.org}_{safe_title}_{timestamp}.{file_ext} file_path self.raw_data_dir / filename try: if file_ext json: with open(file_path, w, encodingutf-8) as f: json.dump(data, f, ensure_asciiFalse, indent2) elif file_ext xml: if isinstance(data, ET.Element): tree ET.ElementTree(data) tree.write(file_path, encodingutf-8, xml_declarationTrue) else: with open(file_path, w, encodingutf-8) as f: f.write(data) elif file_ext csv: if isinstance(data, list) and len(data) 0: df 
pd.DataFrame(data) df.to_csv(file_path, indexFalse, encodingutf-8) else: with open(file_path, w, encodingutf-8) as f: f.write(str(data)) else: with open(file_path, w, encodingutf-8) as f: f.write(str(data)) return file_path except Exception as e: logger.error(f保存文件失败 {file_path}: {str(e)}) raise def get_file_size(self, file_path: Path) - str: 获取文件大小 size file_path.stat().st_size # 转换为人类可读格式 for unit in [B, KB, MB, GB]: if size 1024.0: return f{size:.1f} {unit} size / 1024.0 return f{size:.1f} TB def calculate_md5(self, file_path: Path) - str: 计算文件MD5哈希 hash_md5 hashlib.md5() with open(file_path, rb) as f: for chunk in iter(lambda: f.read(4096), b): hash_md5.update(chunk) return hash_md5.hexdigest() async def save_metadata_to_db(self, metadata: DatasetMetadata, file_path: Optional[Path], status: str): 保存元数据到数据库 conn sqlite3.connect(self.db_path) cursor conn.cursor() try: cursor.execute( INSERT OR REPLACE INTO datasets (id, org, dataset_id, title, description, source_url, last_updated, format, keywords, license, language, coverage_period, geographic_coverage, record_count, file_size, md5_hash, download_status, file_path) VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?) 
, ( metadata.generate_id(), metadata.org, metadata.dataset_id, metadata.title, metadata.description, metadata.source_url, metadata.last_updated, metadata.format, json.dumps(metadata.keywords), metadata.license, metadata.language, metadata.coverage_period, metadata.geographic_coverage, metadata.record_count, metadata.file_size, metadata.md5_hash, status, str(file_path) if file_path else None )) conn.commit() except Exception as e: logger.error(f保存元数据到数据库失败: {str(e)}) conn.rollback() finally: conn.close() async def crawl_world_bank_data(self, indicators: List[str] None): 爬取世界银行数据 Args: indicators: 指标列表如 [NY.GDP.MKTP.CD, SP.POP.TOTL] if indicators is None: indicators [NY.GDP.MKTP.CD, SP.POP.TOTL, SH.DYN.MORT] base_url https://api.worldbank.org/v2/country/all/indicator for indicator in indicators: url f{base_url}/{indicator}?formatjsonper_page10000 try: logger.info(f正在下载世界银行指标: {indicator}) data await self.download_json(url) if data: # 创建元数据 metadata DatasetMetadata( orgInternationalOrg.WORLD_BANK.value, dataset_idfWB_{indicator}, titlefWorld Bank {indicator} Data, descriptionfWorld Bank development indicator: {indicator}, source_urlurl, last_updateddatetime.now().isoformat(), formatJSON, keywords[world bank, development, indicator, indicator], licenseCC BY 4.0, languageen ) # 保存数据 await self.save_dataset(metadata, data, json) except Exception as e: logger.error(f下载世界银行数据失败 {indicator}: {str(e)}) async def crawl_who_data(self): 爬取世界卫生组织数据 who_api https://ghoapi.azureedge.net/api/ endpoints [ Indicator, Dimension, Measure ] for endpoint in endpoints: url f{who_api}{endpoint} try: logger.info(f正在下载WHO数据: {endpoint}) data await self.download_json(url) if data: metadata DatasetMetadata( orgInternationalOrg.WHO.value, dataset_idfWHO_{endpoint}, titlefWHO Global Health Observatory - {endpoint}, descriptionfWHO Global Health Observatory data for {endpoint}, source_urlurl, last_updateddatetime.now().isoformat(), formatJSON, keywords[WHO, health, global, endpoint.lower()], 
licenseCC BY-NC-SA 3.0 IGO, languageen ) await self.save_dataset(metadata, data, json) except Exception as e: logger.error(f下载WHO数据失败 {endpoint}: {str(e)}) async def crawl_multiple_organizations(self): 同时爬取多个国际组织数据 tasks [] # 联合国数据 tasks.append(self.discover_un_datasets()) # 世界银行数据 tasks.append(self.crawl_world_bank_data()) # WHO数据 tasks.append(self.crawl_who_data()) # 并发执行所有任务 results await asyncio.gather(*tasks, return_exceptionsTrue) # 处理结果 successful 0 failed 0 for result in results: if isinstance(result, Exception): logger.error(f爬取任务失败: {str(result)}) failed 1 else: successful 1 logger.info(f爬取完成: {successful} 成功, {failed} 失败) def export_metadata_report(self, output_format: str csv): 导出元数据报告 conn sqlite3.connect(self.db_path) try: df pd.read_sql_query(SELECT * FROM datasets, conn) if output_format.lower() csv: output_path self.metadata_dir / datasets_metadata.csv df.to_csv(output_path, indexFalse, encodingutf-8) elif output_format.lower() excel: output_path self.metadata_dir / datasets_metadata.xlsx df.to_excel(output_path, indexFalse) elif output_format.lower() json: output_path self.metadata_dir / datasets_metadata.json df.to_json(output_path, orientrecords, indent2) logger.info(f元数据报告已导出到: {output_path}) finally: conn.close() async def cleanup(self): 清理资源 if self.session: await self.session.close() if self.client: await self.client.aclose() logger.info(爬虫清理完成) async def main(): 主函数 crawler UNDataCrawler(base_dir./international_org_data) try: await crawler.init_session() # 1. 发现并下载联合国数据 logger.info(开始联合国数据采集...) datasets await crawler.discover_un_datasets() # 2. 下载前5个数据集 download_tasks [] for dataset in datasets[:5]: task crawler.download_dataset(dataset) download_tasks.append(task) await asyncio.gather(*download_tasks) # 3. 爬取世界银行数据 await crawler.crawl_world_bank_data() # 4. 爬取WHO数据 await crawler.crawl_who_data() # 5. 
导出元数据报告 crawler.export_metadata_report(csv) logger.info(国际组织数据采集完成) except KeyboardInterrupt: logger.info(用户中断爬虫运行) except Exception as e: logger.error(f爬虫运行失败: {str(e)}) import traceback traceback.print_exc() finally: await crawler.cleanup() if __name__ __main__: # 运行主函数 asyncio.run(main())高级功能扩展1. 分布式爬虫架构pythonimport redis import rq from celery import Celery from distributed import Client import dask class DistributedUNDataCrawler(UNDataCrawler): 分布式联合国数据爬虫 def __init__(self, redis_urlredis://localhost:6379/0, **kwargs): super().__init__(**kwargs) self.redis_client redis.from_url(redis_url) self.task_queue rq.Queue(un_data_crawl, connectionself.redis_client) def enqueue_crawl_task(self, org: str, dataset_id: str): 将爬取任务加入队列 job self.task_queue.enqueue( self.download_dataset_by_id, org, dataset_id, job_timeout3600 ) return job.id def create_dask_cluster(self, n_workers4): 创建Dask集群 from dask.distributed import LocalCluster self.cluster LocalCluster(n_workersn_workers, threads_per_worker1) self.dask_client Client(self.cluster) return self.dask_client async def parallel_crawl_datasets(self, dataset_urls: List[str]): 并行爬取多个数据集 import dask.bag as db # 使用Dask并行处理 bag db.from_sequence(dataset_urls, npartitions4) results bag.map(self.process_dataset_url).compute() return results2. 
# Data quality checks


class DataQualityChecker:
    """Stateless helpers that score the quality of a downloaded dataset.

    All checks are static methods; the class only serves as a namespace.
    """

    @staticmethod
    def check_completeness(df: pd.DataFrame) -> Dict[str, float]:
        """Measure how many cells of *df* are populated.

        Args:
            df: The table to inspect.

        Returns:
            A dict with three keys: ``completeness`` (fraction of non-null
            cells, between 0.0 and 1.0), ``missing_cells`` (count of null
            cells) and ``total_cells`` (rows times columns).
        """
        total_cells = int(df.size)
        missing_cells = int(df.isnull().sum().sum())
        # An empty frame has no cell that could be missing, so report it as
        # complete instead of dividing by zero; ``total_cells == 0`` still
        # tells the caller that there was nothing to measure.
        if total_cells == 0:
            completeness = 1.0
        else:
            completeness = 1.0 - missing_cells / total_cells
        return {
            "completeness": completeness,
            "missing_cells": missing_cells,
            "total_cells": total_cells,
        }

    @staticmethod
    def check_consistency(df: pd.DataFrame, rules: Dict) -> List[Dict]:
        """Count values that fall outside the numeric bounds given in *rules*.

        Args:
            df: The table to inspect.
            rules: Maps a column name to a rule dict. Only rules whose
                ``"type"`` is ``"numeric"`` are evaluated; they may carry an
                optional ``"min"`` and/or ``"max"`` bound. Rules for columns
                that are absent from *df* are ignored.

        Returns:
            One dict per violated bound with the keys ``column``, ``rule``
            (formatted as ``min>=<bound>`` or ``max<=<bound>``) and
            ``violations`` (number of offending rows). The list is empty
            when every bound holds.
        """
        violations: List[Dict] = []
        for column, rule in rules.items():
            if column not in df.columns or rule.get("type") != "numeric":
                continue
            series = df[column]
            # Each entry: label of the rule, its bound, and the comparison
            # that is True for a value breaking the rule.
            bounds = (
                ("min>=", rule.get("min"), series.lt),
                ("max<=", rule.get("max"), series.gt),
            )
            for label, bound, breaks_rule in bounds:
                if bound is None:
                    continue
                # Comparisons against NaN are False, so missing values are
                # not counted here; completeness is checked separately.
                count = int(breaks_rule(bound).sum())
                if count > 0:
                    violations.append({
                        "column": column,
                        "rule": f"{label}{bound}",
                        "violations": count,
                    })
        return violations

    @staticmethod
    def check_timeliness(last_update: str, threshold_days: int = 365) -> bool:
        """Tell whether *last_update* lies within the last *threshold_days*.

        Args:
            last_update: Timestamp text in any format ``dateutil`` can parse.
            threshold_days: Maximum accepted age in whole days.

        Returns:
            ``True`` when the timestamp is at most *threshold_days* old,
            ``False`` when it is older or cannot be parsed.
        """
        from dateutil.parser import parse

        try:
            update_date = parse(last_update)
        except (ValueError, OverflowError, TypeError):
            # Unparseable input is treated as "not timely" rather than fatal.
            return False
        # Take "now" in the timestamp's own timezone (naive stays naive):
        # subtracting an aware datetime from a naive one raises TypeError.
        now = datetime.now(tz=update_date.tzinfo)
        return (now - update_date).days <= threshold_days


# Section 3 (data visualization and exploration) follows.
数据可视化与探索pythonimport plotly.express as px import plotly.graph_objects as go from plotly.subplots import make_subplots import dash from dash import dcc, html import dash_bootstrap_components as dbc class DataVisualizer: 数据可视化器 staticmethod def create_dashboard(data_path: Path): 创建交互式数据看板 app dash.Dash(__name__, external_stylesheets[dbc.themes.BOOTSTRAP]) # 加载数据 df pd.read_csv(data_path) app.layout dbc.Container([ dbc.Row([ dbc.Col([ html.H1(国际组织数据看板, classNametext-center mb-4) ]) ]), dbc.Row([ dbc.Col([ dcc.Graph( idtime-series-plot, figureDataVisualizer.create_time_series_plot(df) ) ], width6), dbc.Col([ dcc.Graph( idgeo-plot, figureDataVisualizer.create_geo_plot(df) ) ], width6) ]), dbc.Row([ dbc.Col([ dcc.Graph( iddistribution-plot, figureDataVisualizer.create_distribution_plot(df) ) ]) ]) ], fluidTrue) return app staticmethod def create_time_series_plot(df: pd.DataFrame) - go.Figure: 创建时间序列图 fig px.line(df, xyear, yvalue, colorcountry, title数据趋势图) fig.update_layout(height500) return fig staticmethod def create_geo_plot(df: pd.DataFrame) - go.Figure: 创建地理分布图 fig px.choropleth(df, locationscountry_code, colorvalue, hover_namecountry, title全球数据分布) fig.update_layout(height500) return fig