2、存储过程校验(可选)
使用 MySQL 存储过程封装校验逻辑:
sql
DELIMITER //
CREATE PROCEDURE InsertCrawlData(
IN p_url VARCHAR(2048),
IN p_status INT
)
BEGIN
IF p_status NOT BETWEEN 100 AND 599 THEN
SIGNAL SQLSTATE '45000' SET MESSAGE_TEXT = 'Invalid HTTP status code';
END IF;
INSERT INTO crawl_data (url, http_status) VALUES (p_url, p_status);
END //
DELIMITER ;
3、触发器校验(谨慎使用)
在 `BEFORE INSERT` 或 `BEFORE UPDATE` 触发器中校验数据:
sql
CREATE TRIGGER before_insert_crawl_data
BEFORE INSERT ON crawl_data
FOR EACH ROW
BEGIN
IF NEW.http_status < 100 OR NEW.http_status 599 THEN
SIGNAL SQLSTATE '45000' SET MESSAGE_TEXT = 'Invalid HTTP status code';
END IF;
END;
四、数据查询阶段的校验
1、完整性校验
查询缺失关键字段的数据:
sql
SELECT * FROM crawl_data WHERE url IS NULL OR domain_id IS NULL;
2、一致性校验
检查状态码与抓取时间是否匹配(如 404 状态码但抓取时间过旧):
sql
SELECT * FROM crawl_data
WHERE http_status = 404 AND last_crawled < DATE_SUB(NOW(), INTERVAL 7 DAY);
3、重复数据校验
查找重复的 URL:
sql
SELECT url, COUNT(*) AS count
FROM crawl_data
GROUP BY url HAVING count 1;
4、格式校验
验证 URL 格式(需应用层或正则支持):
sql
SELECT * FROM crawl_data
WHERE url NOT REGEXP '^(https?://)';
五、自动化校验与监控
1、定期任务(CRON 或事件调度器)
使用 MySQL 事件调度器定期执行校验查询:
sql
CREATE EVENT daily_data_validation
ON SCHEDULE EVERY 1 DAY
DO
BEGIN
示例:检查无效状态码
INSERT INTO validation_logs (error_type, count)
SELECT 'Invalid Status Code', COUNT(*)
FROM crawl_data
WHERE http_status NOT BETWEEN 100 AND 599;
END;