在開始製作這樣的程式之前,首先要決定用甚麼方法來Parse HTML內容,最精簡快速的方法當然是用Regular Expression,可以很快的篩選出要找的HTML TAG內容,但是Regular Expression的語法門檻較高,對於不常使用的人來說實在不容易上手。由於這裡不是要做商業應用,所以我選擇了一個Open Source的元件「HtmlAgilityPack」,它的好處是可以用類似XmlDocument的方式,以XPath存取HTML Object,方便好用,效能也不錯。
利用WebClient取得網頁內容,同時要考慮網頁Encoding的問題,現在的網頁大都使用UTF8,但還是有很多例外,所以需要根據Content-Type以及charset做判斷,以使用正確的Encoding來處理HTML內容(這邊偷懶使用WebClient且有可能會需要Download兩次,正規的做法還是應該用HttpWebRequest),取得HTML之後就可交由HtmlAgilityPack來Parse。
/// <summary>
/// Downloads the page at <paramref name="uri"/> and parses it into an HtmlAgilityPack document.
/// </summary>
/// <param name="uri">Address of the page to download.</param>
/// <returns>The parsed document, built from the HTML with all CR, LF and TAB characters removed.</returns>
/// <remarks>
/// The page is first downloaded as UTF-8. If AutoEncoding (defined elsewhere in this file)
/// reports a different encoding, the page is downloaded a second time using that encoding.
/// </remarks>
private static HtmlDocument GetHtmlDoc(Uri uri)
{
    string html;

    // WebClient is IDisposable; the using block releases it even when a download throws.
    using (WebClient client = new WebClient())
    {
        client.Encoding = Encoding.UTF8;
        html = client.DownloadString(uri);

        WebHeaderCollection headers = client.ResponseHeaders;
        string contentType = headers != null ? headers.Get("Content-Type") : null;

        Encoding e = AutoEncoding(html, contentType);

        // Encoding does not overload == / !=, so the operator would compare references and
        // could trigger a needless second download for an equal but distinct instance.
        if (e != null && !e.Equals(client.Encoding))
        {
            client.Encoding = e;
            html = client.DownloadString(uri);
        }
    }

    // Strip line breaks and tabs before parsing (they are removed, not replaced by a space).
    html = System.Text.RegularExpressions.Regex.Replace(html, "(\\n|\\r|\\t)", "");

    HtmlDocument doc = new HtmlDocument();
    doc.LoadHtml(html);
    return doc;
}
The Open Graph protocol enables any web page to become a rich object in a social graph. For instance, this is used on Facebook to allow any web page to have the same functionality as any other object on Facebook.其中幾個常用的meta tag正可以讓我們取得想要的資訊:
- og:title:網頁的標題
- og:description:網頁內文摘要
- og:image:圖片網址
- og:site_name:網站名稱
1: <head>
2: <title>網頁標題</title>
3: <meta property="og:title" content="網頁標題"/>
4: <meta property="og:description" content="內文摘要"/>
5: <meta property="og:image" content="圖片網址"/>
6: <meta property="og:site_name" content="網站名稱"/>
7: ...
- <title>:網頁的標題
- <meta name="description" content="網頁內文摘要" />
- <meta name="thumbnail" content="圖片網址" />
- <link rel="image_src" href="圖片網址" />
/// <summary>
/// Fills <paramref name="info"/> from the document's head: classic <c>name</c> meta tags,
/// Open Graph <c>property</c> meta tags, and <c>&lt;link rel="image_src"&gt;</c>.
/// </summary>
/// <param name="doc">Parsed HTML document to inspect.</param>
/// <param name="info">Page information to update; Title, Content, Site and Images may be written.</param>
/// <remarks>
/// Tags are processed in document order, so a later tag overrides an earlier one for the same
/// field. A <c>link rel="image_src"</c> overrides any image found in the meta tags. At most one
/// image URL is added to <c>info.Images</c>.
/// </remarks>
private static void ParseHead(HtmlDocument doc, ref PageInfo info)
{
    string image = "";

    // SelectNodes returns null (not an empty collection) when nothing matches.
    HtmlNodeCollection metaNodes = doc.DocumentNode.SelectNodes("//head/meta");
    if (metaNodes != null)
    {
        foreach (HtmlNode node in metaNodes)
        {
            string content = node.GetAttributeValue("content", "");
            if (string.IsNullOrEmpty(content))
            {
                // Nothing usable; never overwrite a value found earlier with an empty one.
                continue;
            }

            switch (node.GetAttributeValue("name", ""))
            {
                case "thumbnail":
                    image = content;
                    break;
                case "title":
                    info.Title = content;
                    break;
                case "description":
                    info.Content = content;
                    break;
            }

            switch (node.GetAttributeValue("property", ""))
            {
                case "og:image":
                    image = content;
                    break;
                case "og:title":
                    info.Title = content;
                    break;
                case "og:description":
                    info.Content = content;
                    break;
                case "og:site_name":
                    info.Site = content;
                    break;
            }
        }
    }

    HtmlNodeCollection linkNodes = doc.DocumentNode.SelectNodes("//link");
    if (linkNodes != null)
    {
        foreach (HtmlNode node in linkNodes)
        {
            if (node.GetAttributeValue("rel", "") != "image_src")
            {
                continue;
            }

            string href = node.GetAttributeValue("href", "");
            if (!string.IsNullOrEmpty(href))
            {
                image = href;
            }
        }
    }

    if (!string.IsNullOrEmpty(image))
    {
        info.Images.Add(image);
    }
}
/// <summary>
/// Generic page-information extractor: reads the head tags via ParseHead, then falls back to
/// the URI host, the <c>&lt;title&gt;</c> element and the first sufficiently long paragraph
/// for any field that is still empty, and finally collects image URLs from the document.
/// </summary>
/// <param name="info">Page information to fill in; the same instance is returned.</param>
/// <param name="uri">Address of the page; supplies the fallback site name and the base for relative image URLs.</param>
/// <param name="doc">Parsed HTML document of the page.</param>
/// <returns><paramref name="info"/> after it has been populated.</returns>
private static PageInfo GetPageInfo(PageInfo info, Uri uri, HtmlDocument doc)
{
    // A paragraph must be longer than this to be used as the summary.
    const int MinSummaryLength = 65;
    // Summaries longer than this are cut and suffixed with "...".
    const int MaxSummaryLength = 200;

    ParseHead(doc, ref info);

    if (string.IsNullOrEmpty(info.Site))
    {
        info.Site = uri.Host;
    }

    if (string.IsNullOrEmpty(info.Title))
    {
        // SelectSingleNode returns null when the page has no <title>.
        HtmlNode nodeTitle = doc.DocumentNode.SelectSingleNode("//head/title");
        if (nodeTitle != null)
        {
            info.Title = nodeTitle.InnerText;
        }
    }

    if (string.IsNullOrEmpty(info.Content))
    {
        // SelectNodes returns null (not an empty collection) when nothing matches.
        HtmlNodeCollection paragraphs = doc.DocumentNode.SelectNodes("//p");
        if (paragraphs != null)
        {
            foreach (HtmlNode node in paragraphs)
            {
                // Read InnerText once per node instead of on every use.
                string text = node.InnerText;
                if (string.IsNullOrWhiteSpace(text) || text.Length <= MinSummaryLength)
                {
                    continue;
                }

                info.Content = text.Length > MaxSummaryLength
                    ? text.Substring(0, MaxSummaryLength) + "..."
                    : text;
                break;
            }
        }
    }

    // Images from the body are always collected, in addition to any image ParseHead added.
    HtmlNodeCollection images = doc.DocumentNode.SelectNodes("//img");
    if (images != null)
    {
        foreach (HtmlNode node in images)
        {
            string src = node.GetAttributeValue("src", "");

            // Ordinal, case-insensitive suffix match: ToLower() follows the current culture
            // and can fail to match (e.g. "GIF" under Turkish casing rules).
            if (src.EndsWith("png", StringComparison.OrdinalIgnoreCase)
                || src.EndsWith("jpg", StringComparison.OrdinalIgnoreCase)
                || src.EndsWith("gif", StringComparison.OrdinalIgnoreCase))
            {
                info.Images.Add(GetAbsoluteUrl(uri, src));
            }
        }
    }

    return info;
}
/// <summary>
/// Site-specific extractor for Engadget's Chinese edition: takes the title from
/// <c>&lt;title&gt;</c>, and the summary and first image from the <c>div.postbody</c> element.
/// </summary>
/// <param name="info">Page information to fill in; the same instance is returned.</param>
/// <param name="uri">Address of the page; used as the base for a relative image URL.</param>
/// <param name="doc">Parsed HTML document of the page.</param>
/// <returns><paramref name="info"/> after it has been populated.</returns>
/// <remarks>
/// Every XPath lookup may return null if the site's markup changes. A missing element leaves
/// the corresponding field untouched instead of throwing a NullReferenceException.
/// </remarks>
private static PageInfo GetPageInfo_ChineseEngadget(PageInfo info, Uri uri, HtmlDocument doc)
{
    info.Site = "Engadget中文版";

    HtmlNode nodeTitle = doc.DocumentNode.SelectSingleNode("//head/title");
    if (nodeTitle != null)
    {
        info.Title = nodeTitle.InnerText;
    }

    HtmlNode nodeBody = doc.DocumentNode.SelectSingleNode("//div[@class='postbody']");
    if (nodeBody == null)
    {
        // Layout not recognised; return what has been gathered so far.
        return info;
    }

    HtmlNode nodeDiv = nodeBody.SelectSingleNode(".//div[2]");
    if (nodeDiv != null)
    {
        info.Content = nodeDiv.InnerText;
    }

    HtmlNode nodeImage = nodeBody.SelectSingleNode(".//img");
    if (nodeImage != null)
    {
        string src = nodeImage.GetAttributeValue("src", "");
        if (!string.IsNullOrEmpty(src))
        {
            info.Images.Add(GetAbsoluteUrl(uri, src));
        }
    }

    return info;
}
沒有留言:
張貼留言