C# 宽度优先的网络爬虫

Querida ·
更新时间:2024-11-13
· 696 次阅读

using System;
using System.Collections.Generic;
using System.ComponentModel;
using System.Data;
using System.Drawing;
using System.Linq;
using System.Text;
using System.Windows.Forms;
using Tool;
using System.Net;
using System.Text.RegularExpressions;
using System.Threading;

namespace Search
{
    /// <summary>
    /// Main form of a small breadth-first web crawler.
    /// button1 downloads one page, lists its links and saves its images;
    /// button2 starts a breadth-first crawl on a worker thread.
    /// </summary>
    /// <remarks>
    /// zzHttp, Utility and Parser are declared outside this file; comments about
    /// what they do are assumptions based on how they are called here.
    /// </remarks>
    public partial class Form1 : Form
    {
        /// <summary>Upper bound used by the crawl loop condition in <see cref="search"/>.</summary>
        private const int MaxVisitedUrls = 1000;

        /// <summary>
        /// Matches an img tag and captures the value of its src attribute in the
        /// "imgUrl" group. Uses \s for whitespace; with a bare "s" in the character
        /// classes, every URL containing the letter 's' would be cut short.
        /// </summary>
        private static readonly Regex ImageTagRegex = new Regex(
            @"<img[^<>]*?src\s*=\s*[""']?\s*(?<imgUrl>[^\s""'<>]*)[^<>]*?/?\s*>",
            RegexOptions.IgnoreCase);

        public Form1()
        {
            InitializeComponent();
        }

        /// <summary>
        /// FIFO queue of URLs waiting to be visited, backed by a linked list.
        /// </summary>
        public class Queue
        {
            // Linked list gives O(1) insertion at the tail and removal at the head.
            private LinkedList<string> queue = new LinkedList<string>();

            /// <summary>Appends <paramref name="t"/> to the tail of the queue.</summary>
            public void enQueue(string t)
            {
                queue.AddLast(t);
            }

            /// <summary>
            /// Removes and returns the element at the head of the queue.
            /// </summary>
            /// <returns>The oldest queued element, or null when the queue is empty.</returns>
            public string deQueue()
            {
                if (queue.Count == 0)
                {
                    // Callers in this file null-check the result, so report "empty" as null.
                    return null;
                }

                // Take from the head: elements are added at the tail, so removing from the
                // tail as well would make this a stack and the crawl depth-first.
                string head = queue.First.Value;
                queue.RemoveFirst();
                return head;
            }

            /// <summary>Returns true when the queue holds no elements.</summary>
            public bool isQueueEmpty()
            {
                return queue.Count == 0;
            }

            /// <summary>Returns true when <paramref name="t"/> is already queued.</summary>
            /// <remarks>The misspelled name is kept so existing callers keep compiling.</remarks>
            public bool contians(string t)
            {
                return queue.Contains(t);
            }

            /// <summary>Returns the number of queued elements.</summary>
            public int getcount()
            {
                return queue.Count;
            }
        }

        /// <summary>
        /// Static crawl state: the set of visited URLs and the queue of URLs still to visit.
        /// </summary>
        public class LinkQueue
        {
            // URLs that have already been visited.
            private static ISet<string> visitedUrl = new HashSet<string>();

            // URLs waiting to be visited.
            private static Queue unVisitedUrl = new Queue();

            /// <summary>Returns the queue of URLs waiting to be visited.</summary>
            public static Queue getUnVisitedUrl()
            {
                return unVisitedUrl;
            }

            /// <summary>Records <paramref name="url"/> as visited.</summary>
            public static void addVisitedUrl(String url)
            {
                visitedUrl.Add(url);
            }

            /// <summary>Removes <paramref name="url"/> from the visited set.</summary>
            public static void removeVisitedUrl(String url)
            {
                visitedUrl.Remove(url);
            }

            /// <summary>Dequeues the next URL to visit; null when none is waiting.</summary>
            public static Object unVisitedUrlDeQueue()
            {
                return unVisitedUrl.deQueue();
            }

            /// <summary>
            /// Queues <paramref name="url"/> unless it is blank, already visited or already
            /// queued, so that each URL is visited at most once.
            /// </summary>
            public static void addUnvisitedUrl(String url)
            {
                if (string.IsNullOrWhiteSpace(url))
                {
                    return;
                }

                if (visitedUrl.Contains(url) || unVisitedUrl.contians(url))
                {
                    return;
                }

                unVisitedUrl.enQueue(url);
            }

            /// <summary>Returns the number of visited URLs.</summary>
            public static int getVisitedUrlNum()
            {
                return visitedUrl.Count;
            }

            /// <summary>Returns true when no URL is waiting to be visited.</summary>
            public static bool unVisitedUrlsEmpty()
            {
                return unVisitedUrl.isQueueEmpty();
            }
        }

        // Not referenced anywhere in the visible code; kept in case another part of the form uses it.
        string[] urlarr = new string[100];

        /// <summary>
        /// Fetches the page named in textBox1 (a default URL when the box is empty), lists the
        /// extracted links in richTextBox1, then downloads every image referenced by an img tag
        /// and shows the number of downloads in label1.
        /// </summary>
        private void button1_Click(object sender, EventArgs e)
        {
            zzHttp http = new zzHttp();
            CookieContainer cookie = new CookieContainer();
            string url = textBox1.Text != "" ? textBox1.Text : "http://image.baidu.com/";

            // Presumably performs an HTTP GET and returns the response body (zzHttp is not shown).
            string content = http.SendDataByGET(url, "", ref cookie);
            string baseUri = Utility.GetBaseUri(url);
            string[] links = Parser.ExtractLinks(baseUri, content);

            // AppendText avoids rebuilding the whole text box content for every link.
            foreach (string link in links)
            {
                richTextBox1.AppendText(link);
                richTextBox1.AppendText(" ");
            }

            // Collect the src of every img tag on the page.
            MatchCollection matches = ImageTagRegex.Matches(content);
            Queue que = new Queue();
            foreach (Match match in matches)
            {
                que.enQueue(match.Groups["imgUrl"].Value);
            }

            // Drain until empty. Comparing a rising counter against the shrinking
            // count would stop after roughly half of the images.
            int k = 0;
            while (!que.isQueueEmpty())
            {
                string picurl = que.deQueue();
                if (string.IsNullOrWhiteSpace(picurl))
                {
                    // The pattern can match an empty src; there is nothing to download.
                    continue;
                }

                richTextBox1.AppendText(picurl);
                richTextBox1.AppendText(" ");

                // The file name is the last path segment of the image URL.
                string[] s = picurl.Split('/');
                string picname = s[s.Length - 1];

                // Absolute target directory; "d:pic" (no backslash) would be relative to
                // the current directory on drive D.
                zzHttp.downfile(picurl, picname, @"d:\pic");
                k++;
            }

            label1.Text = k + "张";
        }

        /// <summary>
        /// Breadth-first crawl from a fixed seed URL. Runs while URLs are waiting and the
        /// visited count has not exceeded <see cref="MaxVisitedUrls"/>. Progress goes to
        /// label2 and each newly seen link to richTextBox1.
        /// </summary>
        void search()
        {
            LinkQueue.addUnvisitedUrl("http://blog.csdn.net/zhujunxxxxx/");

            while (!LinkQueue.unVisitedUrlsEmpty() && LinkQueue.getVisitedUrlNum() <= MaxVisitedUrls)
            {
                // Take the URL at the head of the queue.
                String visitUrl = (String)LinkQueue.unVisitedUrlDeQueue();
                if (visitUrl == null)
                {
                    continue;
                }

                zzHttp downLoader = new zzHttp();
                CookieContainer cookie = new CookieContainer();

                // Download the page.
                string content = downLoader.SendDataByGET(visitUrl, "", ref cookie);

                // Mark the URL as visited.
                LinkQueue.addVisitedUrl(visitUrl);

                // Extract the links contained in the downloaded page.
                string baseUri = Utility.GetBaseUri(visitUrl);
                string[] links = Parser.ExtractLinks(baseUri, content);

                Add2Message("已访问数目:" + LinkQueue.getVisitedUrlNum() + ",count=" + LinkQueue.getUnVisitedUrl().getcount());

                // Queue the links that have not been seen yet.
                foreach (string link in links)
                {
                    // NOTE(review): substring match, so any URL merely containing "js", "css",
                    // etc. anywhere is skipped too, not only static resources. Confirm intent.
                    if (link.Contains("css") || link.Contains("js") || link.Contains("gif")
                        || link.Contains("jpg") || link.Contains("png") || link.Contains("jpeg"))
                    {
                        continue;
                    }

                    LinkQueue.addUnvisitedUrl(link);
                    AddMessage(link);
                }
            }
        }

        /// <summary>Starts the crawl on a worker thread so the UI stays responsive.</summary>
        private void button2_Click(object sender, EventArgs e)
        {
            Thread worker = new Thread(search);

            // Background thread: the crawl must not keep the process alive after the form closes.
            worker.IsBackground = true;
            worker.Start();
        }

        private delegate void InfoDelegate(string message);

        /// <summary>
        /// Appends <paramref name="message"/> as a new line to richTextBox1 and scrolls to it.
        /// May be called from any thread.
        /// </summary>
        public void AddMessage(string message)
        {
            if (richTextBox1.InvokeRequired)
            {
                // Called off the UI thread: re-invoke this method on the thread owning the control.
                InfoDelegate d = new InfoDelegate(AddMessage);
                richTextBox1.Invoke(d, new object[] { message });
            }
            else
            {
                richTextBox1.AppendText(message + Environment.NewLine);
                richTextBox1.ScrollToCaret();
            }
        }

        private delegate void Info2Delegate(string message);

        /// <summary>
        /// Replaces the text of label2 with <paramref name="message"/>.
        /// May be called from any thread.
        /// </summary>
        public void Add2Message(string message)
        {
            if (label2.InvokeRequired)
            {
                // Called off the UI thread: re-invoke this method on the thread owning the control.
                Info2Delegate d = new Info2Delegate(Add2Message);
                label2.Invoke(d, new object[] { message });
            }
            else
            {
                label2.Text = message;
            }
        }
    }
}



爬虫 C# 网络爬虫

需要 登录 后方可回复, 如果你还没有账号请 注册新账号