How To Fetch All Links From a URL

by santosh • August 07, 2008

0

<%@ Page Language="C#" AutoEventWireup="true" CodeFile="FetchLink.aspx.cs" Inherits="FetchLink" %>
<%--
    FetchLink.aspx: the visitor types a URL into TextBox1 and clicks Button1.
    The click is handled by Button1_Click in the code-behind class FetchLink,
    which binds its results to GridView1. The IDs TextBox1, Button1 and
    GridView1 are referenced from the code-behind and must not be renamed here.
--%>
<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN" "http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd">
<html xmlns="http://www.w3.org/1999/xhtml">
<head runat="server">
   <title>Fetch All Links From a URL</title>
</head>
<body>
   <form id="form1" runat="server">
       <div>
           <table style="width: 50%">
               <tr>
                   <td style="width: 100px">
                       <%-- AssociatedControlID renders a <label for="..."> tied to the text box. --%>
                       <asp:Label ID="UrlLabel" runat="server" AssociatedControlID="TextBox1" Text="Enter URL"></asp:Label></td>
                   <td style="width: 100px">
                       <asp:TextBox ID="TextBox1" runat="server" Width="243px"></asp:TextBox>
                       <asp:Button ID="Button1" runat="server" Text="Fetch Link" OnClick="Button1_Click" /></td>
               </tr>
               <tr>
                   <td style="width: 100px">
                   </td>
                   <td style="width: 100px">
                       <%-- Columns are auto-generated from the list bound in the code-behind. --%>
                       <asp:GridView ID="GridView1" runat="server">
                       </asp:GridView>
                   </td>
               </tr>
           </table>
       </div>
   </form>
</body>
</html>

using System;
using System.Data;
using System.Configuration;
using System.Collections;
using System.Web;
using System.Web.Security;
using System.Web.UI;
using System.Web.UI.WebControls;
using System.Web.UI.WebControls.WebParts;
using System.Web.UI.HtmlControls;
using System.Collections.Generic;
using System.Text.RegularExpressions;
using System.Net;
using System.IO;

/// <summary>
/// Code-behind for FetchLink.aspx. Downloads the page at the URL entered in
/// TextBox1, extracts the <c>href</c> attributes found in its HTML and binds
/// them to GridView1.
/// </summary>
public partial class FetchLink : System.Web.UI.Page
{
    /// <summary>
    /// Matches <c>href="..."</c> (double-quoted) or <c>href=value </c>
    /// (unquoted, terminated by a space); the target is captured in group "url".
    /// </summary>
    private const string HrefPattern = "href\\s*=\\s*(?:(?:\\\"(?<url>[^\\\"]*)\\\")|(?<url>[^\\s]* ))";

    /// <summary>Shared, case-insensitive regex so the pattern is parsed only once.</summary>
    private static readonly Regex HrefRegex = new Regex(HrefPattern, RegexOptions.IgnoreCase);

    /// <summary>Timeout applied to the outgoing HTTP request, in milliseconds.</summary>
    private const int RequestTimeoutMilliseconds = 100000;

    protected void Page_Load(object sender, EventArgs e)
    {
    }

    /// <summary>
    /// Downloads <paramref name="urlToScrape"/> and returns the distinct links
    /// found in its HTML, in document order.
    /// </summary>
    /// <param name="urlToScrape">Absolute http/https address of the page to scan.</param>
    /// <returns>
    /// One entry per distinct link, formatted as <c>href="URL"</c>. Relative
    /// links are resolved against <paramref name="urlToScrape"/>; absolute
    /// links and <c>javascript:</c> pseudo-links are returned unchanged.
    /// </returns>
    /// <exception cref="ArgumentNullException"><paramref name="urlToScrape"/> is null.</exception>
    /// <exception cref="WebException">The page could not be retrieved.</exception>
    public static List<string> GetAllUrlsFromUri(Uri urlToScrape)
    {
        if (urlToScrape == null)
        {
            throw new ArgumentNullException("urlToScrape");
        }

        //get the contents of the page and put it to a string
        string pageContents = GetPageContents(urlToScrape);

        //the list that would hold the urls
        List<string> listOfUrls = new List<string>();

        //loop thru all the matching strings
        Match match = HrefRegex.Match(pageContents);
        while (match.Success)
        {
            //use the captured target rather than the whole match, so spacing
            //around '=' or a missing quote cannot corrupt the result
            string rawUrl = match.Groups["url"].Value.Trim();
            string urlToAdd = "href=\"" + ResolveUrl(urlToScrape, rawUrl) + "\"";

            //de-duplicate on the final value, so repeated relative links
            //are only listed once
            if (!listOfUrls.Contains(urlToAdd))
            {
                listOfUrls.Add(urlToAdd);
            }

            //move to the next match result
            match = match.NextMatch();
        }

        //return the list of urls that we have recovered from the site
        return listOfUrls;
    }

    /// <summary>
    /// Turns a link target found in a page into an absolute address.
    /// </summary>
    /// <param name="baseUri">Address of the page the link was found on.</param>
    /// <param name="rawUrl">The link target exactly as written in the page.</param>
    /// <returns>
    /// <paramref name="rawUrl"/> unchanged when it is a <c>javascript:</c>
    /// pseudo-link or already an absolute non-file URI; otherwise the target
    /// resolved against <paramref name="baseUri"/>. If it cannot be resolved
    /// it is returned unchanged.
    /// </returns>
    private static string ResolveUrl(Uri baseUri, string rawUrl)
    {
        //javascript: pseudo-links are displayed as is
        if (rawUrl.StartsWith("javascript:", StringComparison.OrdinalIgnoreCase))
        {
            return rawUrl;
        }

        //already absolute (http, https, mailto, ...): keep the author's spelling.
        //Targets that parse as file paths (e.g. "//host/x") go through the
        //relative resolution below instead.
        Uri absolute;
        if (Uri.TryCreate(rawUrl, UriKind.Absolute, out absolute) && !absolute.IsFile)
        {
            return rawUrl;
        }

        //relative links ("page.aspx", "/root", "../up", "#anchor") are resolved
        //against the page they were found on
        Uri resolved;
        if (Uri.TryCreate(baseUri, rawUrl, out resolved))
        {
            return resolved.AbsoluteUri;
        }

        return rawUrl;
    }

    /// <summary>
    /// Reads a webpage and captures its html representation into a string
    /// </summary>
    /// <param name="urlToScrape">the website you want to read</param>
    /// <returns>the html representation of the site</returns>
    /// <exception cref="InvalidCastException">
    /// <paramref name="urlToScrape"/> is not an http/https address.
    /// </exception>
    /// <exception cref="WebException">The request failed or timed out.</exception>
    private static string GetPageContents(Uri urlToScrape)
    {
        //create an http request for the url and assign a timeout value
        HttpWebRequest httpWebRequest = (HttpWebRequest)WebRequest.Create(urlToScrape);
        httpWebRequest.Timeout = RequestTimeoutMilliseconds;

        //the using statements release the response, stream and reader even when
        //reading fails, and are skipped safely when the request itself throws;
        //exceptions propagate to the caller with their original stack trace
        using (WebResponse webResponse = httpWebRequest.GetResponse())
        using (Stream responseStream = webResponse.GetResponseStream())
        using (StreamReader streamReader = new StreamReader(responseStream))
        {
            //read the contents of the stream
            return streamReader.ReadToEnd();
        }
    }

    /// <summary>
    /// Saves our list of urls to a text file
    /// </summary>
    /// <param name="listOfUrls">the list containing the urls</param>
    /// <returns>the filename created for the file</returns>
    /// <exception cref="ArgumentNullException"><paramref name="listOfUrls"/> is null.</exception>
    public static string SaveToFile(List<string> listOfUrls)
    {
        //validate before creating the file so a bad call leaves nothing behind
        if (listOfUrls == null)
        {
            throw new ArgumentNullException("listOfUrls");
        }

        //the file name; it has no directory part, so the file is created in
        //the process's current directory
        string fileName = String.Format("{0}.{1}", Guid.NewGuid(), "txt");

        //the using statement flushes and closes the writer even if a write fails
        using (StreamWriter sw = File.CreateText(fileName))
        {
            //write each url on its own line
            foreach (string url in listOfUrls)
            {
                sw.WriteLine(url);
            }
        }

        //return our filename
        return fileName;
    }

    /// <summary>
    /// Handles the "Fetch Link" button: validates the URL typed into TextBox1,
    /// fetches its links and binds them to GridView1. Invalid input and
    /// download failures are reported through the grid's empty-data text
    /// instead of an unhandled exception.
    /// </summary>
    protected void Button1_Click(object sender, EventArgs e)
    {
        //the list that would contain the urls recovered from the specified uri
        List<string> listOfUrls = new List<string>();

        //NOTE(review): the server fetches whatever address the visitor supplies,
        //which can be abused to reach internal hosts. Only the scheme is
        //checked here; restrict the allowed hosts if this page is public.
        Uri urlToScrape;
        string enteredUrl = TextBox1.Text.Trim();

        if (!Uri.TryCreate(enteredUrl, UriKind.Absolute, out urlToScrape) || !IsHttpUrl(urlToScrape))
        {
            GridView1.EmptyDataText = "Please enter a valid http:// or https:// URL.";
        }
        else
        {
            try
            {
                listOfUrls = GetAllUrlsFromUri(urlToScrape);
                GridView1.EmptyDataText = "No links were found on the page.";
            }
            catch (WebException)
            {
                //DNS failure, timeout, HTTP error status, ...
                GridView1.EmptyDataText = "The page could not be retrieved.";
            }
        }

        GridView1.DataSource = listOfUrls;
        GridView1.DataBind();
    }

    /// <summary>Returns true when <paramref name="uri"/> uses the http or https scheme.</summary>
    private static bool IsHttpUrl(Uri uri)
    {
        return uri.Scheme == Uri.UriSchemeHttp || uri.Scheme == Uri.UriSchemeHttps;
    }
}

Tags: ASP.NET

How To Fetch All Links From a URL

Post a Comment

Blog ads

CodeGuru

Contact form