今天我们来讲讲如何使用.NET开源(MIT License)的轻量、灵活、高性能、跨平台的分布式网络爬虫框架DotnetSpider来快速实现网页数据抓取功能。
注意:为了自身安全请在国家法律允许范围内开发网页爬虫功能。
本文我们以抓取博客园10天推荐排行榜第一页的文章标题、文章简介和文章地址为示例,并把抓取下来的数据保存到对应的txt文本中。
图片
创建名为DotnetSpiderExercise的控制台应用。
图片
图片
图片
NuGet包管理器搜索:DotnetSpider
图片
NuGet包管理器搜索:Serilog.AspNetCore
图片
namespace DotnetSpiderExercise
{
    /// <summary>
    /// One entry scraped from the recommended-ranking page: title, summary and link of an article.
    /// </summary>
    public class RecommendedRankingModel
    {
        /// <summary>
        /// Article title.
        /// </summary>
        public string ArticleTitle { get; set; }

        /// <summary>
        /// Article summary.
        /// </summary>
        public string ArticleSummary { get; set; }

        /// <summary>
        /// Article URL.
        /// </summary>
        public string ArticleUrl { get; set; }
    }
}
网页数据抓取的业务逻辑都在下面的 RecommendedRankingSpider 类里面。
using DotnetSpider.DataFlow.Parser;
using DotnetSpider.DataFlow;
using DotnetSpider.Downloader;
using DotnetSpider.Http;
using DotnetSpider.Scheduler.Component;
using DotnetSpider.Selector;
using DotnetSpider;
using Microsoft.Extensions.Logging;
using Microsoft.Extensions.Options;
using Serilog;
using DotnetSpider.Scheduler;
using Microsoft.Extensions.Hosting;
using System.Reflection;

namespace DotnetSpiderExercise
{
    /// <summary>
    /// Spider that requests the cnblogs 10-day recommended ranking page, extracts the
    /// title, summary and URL of each listed article, and writes them to a text file.
    /// </summary>
    public class RecommendedRankingSpider : Spider
    {
        /// <summary>Address of the cnblogs 10-day recommended ranking page.</summary>
        private const string RankingUrl = "https://www.cnblogs.com/aggsite/topdiggs";

        /// <summary>Value assigned to <c>Request.Timeout</c> (10 seconds according to the original author).</summary>
        private const int RequestTimeout = 10000;

        /// <summary>
        /// Creates the spider; all arguments are forwarded unchanged to the <see cref="Spider"/> base class.
        /// </summary>
        /// <param name="options">Spider options passed to the base constructor.</param>
        /// <param name="services">Dependency services passed to the base constructor.</param>
        /// <param name="logger">Logger passed to the base constructor.</param>
        public RecommendedRankingSpider(
            IOptions<SpiderOptions> options,
            DependenceServices services,
            ILogger<Spider> logger)
            : base(options, services, logger)
        {
        }

        /// <summary>
        /// Builds a host for this spider (Serilog logging, <c>HttpClientDownloader</c>,
        /// BFS queue scheduler with a hash-set duplicate remover) and runs it.
        /// </summary>
        /// <returns>A task that completes when the built host's <c>RunAsync</c> completes.</returns>
        public static async Task RunAsync()
        {
            var builder = Builder.CreateDefaultBuilder<RecommendedRankingSpider>();
            builder.UseSerilog();
            builder.UseDownloader<HttpClientDownloader>();
            builder.UseQueueDistinctBfsScheduler<HashSetDuplicateRemover>();
            await builder.Build().RunAsync();
        }

        /// <summary>
        /// Registers the data flows (custom parser, then console storage) and enqueues the
        /// single request for the ranking page.
        /// </summary>
        /// <param name="stoppingToken">Cancellation token supplied by the framework; not used here.</param>
        protected override async Task InitializeAsync(CancellationToken stoppingToken = default)
        {
            // Register the custom parser.
            AddDataFlow(new Parser());

            // Use the console storage.
            AddDataFlow(new ConsoleStorage());

            // Enqueue the crawl request: cnblogs 10-day recommended ranking.
            await AddRequestsAsync(new Request(RankingUrl)
            {
                // Request timeout: 10 seconds.
                Timeout = RequestTimeout
            });
        }

        /// <summary>
        /// Data flow that parses the ranking page and saves the extracted articles to
        /// <c>RecommendedRanking.txt</c>.
        /// </summary>
        class Parser : DataParser
        {
            /// <summary>Name of the text file the parsed articles are written to.</summary>
            private const string OutputFileName = "RecommendedRanking.txt";

            /// <summary>Separator line written after every article entry.</summary>
            private const string EntrySeparator =
                "\r\n ========================================================================================== \r\n";

            /// <summary>No initialization is required for this parser.</summary>
            public override Task InitializeAsync()
            {
                return Task.CompletedTask;
            }

            /// <summary>
            /// Selects every <c>article.post-item</c> element of the downloaded page, reads its
            /// title, summary and link, echoes the title to the console, and writes all
            /// entries to <see cref="OutputFileName"/>.
            /// </summary>
            /// <param name="context">Data flow context whose <c>Selectable</c> exposes the downloaded page.</param>
            /// <returns>A completed task; all work is done synchronously.</returns>
            protected override Task ParseAsync(DataFlowContext context)
            {
                var recommendedRankingList = new List<RecommendedRankingModel>();

                // Parse the page content.
                var number = 1;
                var recommendedList = context.Selectable.SelectList(Selectors.XPath(".//article[@class='post-item']"));
                foreach (var news in recommendedList)
                {
                    var articleTitle = news.Select(Selectors.XPath(".//a[@class='post-item-title']"))?.Value;

                    // Strip line breaks and spaces from the summary. The escape must be "\n":
                    // "/n" would only remove the literal two-character text "/n".
                    var articleSummary = news.Select(Selectors.XPath(".//p[@class='post-item-summary']"))?.Value?
                        .Replace("\n", "")
                        .Replace(" ", "");

                    var articleUrl = news.Select(Selectors.XPath(".//a[@class='post-item-title']/@href"))?.Value;

                    Console.WriteLine($"第{number}篇文章 标题:{articleTitle}");

                    recommendedRankingList.Add(new RecommendedRankingModel
                    {
                        ArticleTitle = articleTitle,
                        ArticleSummary = articleSummary,
                        ArticleUrl = articleUrl
                    });
                    number++;
                }

                // StreamWriter(string) creates the file or overwrites an existing one.
                using (StreamWriter sw = new StreamWriter(OutputFileName))
                {
                    foreach (RecommendedRankingModel model in recommendedRankingList)
                    {
                        // "\r\n" (not the literal text "/r/n") puts each field on its own line.
                        string line = $"文章标题:{model.ArticleTitle}\r\n文章简介:{model.ArticleSummary}\r\n文章地址:{model.ArticleUrl}";
                        sw.WriteLine(line + EntrySeparator);
                    }
                }

                return Task.CompletedTask;
            }
        }
    }
}
namespace DotnetSpiderExercise
{
    /// <summary>
    /// Console entry point: prints a start message, runs the ranking spider, then prints
    /// a completion message.
    /// </summary>
    public class Program
    {
        /// <summary>
        /// Application entry point.
        /// </summary>
        /// <param name="args">Command-line arguments; not used.</param>
        private static async Task Main(string[] args)
        {
            Console.WriteLine("网页数据抓取开始...");

            Task crawl = RecommendedRankingSpider.RunAsync();
            await crawl;

            Console.WriteLine("网页数据抓取完成...");
        }
    }
}
图片
图片
图片
更多项目实用功能和特性欢迎前往项目开源地址查看
本文链接://www.dmpip.com/showinfo-26-90186-0.html 《我们一起聊聊.NET快速实现网页数据抓取》
声明:本网页内容旨在传播知识,若有侵权等问题请及时与本网联系,我们将在第一时间删除处理。邮件:2376512515@qq.com