Screen Scraping Made Easy





1
Date Submitted Mon. Apr. 16th, 2007 9:34 PM
Revision 1 of 1
Beginner leroi
Tags API | C | scraping | Screen | webrequest | webresponse
Comments 4 comments
Shows a few objects I built to perform some screen scraping...


namespace LeroyNetSolutions.Net
{
    #region [ Using Directives ]

    using System;
    using System.Collections.Generic;
    using System.Text;
    using System.Net;
    using System.IO;
    using System.Web;

    #endregion


    #region [ WebProcessor ]

    /// <summary>
    ///     An object used to process web pages.
    ///     e.g: a Post or Get request to a specified Url/Uri.
    /// </summary>
    public class WebProcessor
    {
        /// <summary>
        ///     Creates and prepares a new HttpWebRequest Object.
        /// </summary>
        /// <param name="info">
        ///     The object used to store information about the session
        ///     and information about the link to retrieve or post data to.
        /// </param>
        /// <returns>
        ///     Returns the newly created HttpWebRequest Object ready for consumption.
        /// </returns>
        public static HttpWebRequest CreateWebRequest(HttpRequestInfo info)
        {
            HttpWebRequest request = null;
            try
            {
                request = (HttpWebRequest)WebRequest.Create(info.Url);

                if (info.Method == HttpMethod.Post)
                    request.Method = "POST";
                else
                    request.Method = "GET";

                if (!String.IsNullOrEmpty(info.ContentType))
                    request.ContentType = info.ContentType;

                if (null != info.Cookies)
                    request.CookieContainer = info.Cookies;

                request.AllowAutoRedirect = info.AllowAutoRedirect;
                request.KeepAlive = info.KeepAlive;
                request.MaximumAutomaticRedirections = info.MaxRedirects;

                if (null != info.Proxy)
                {
                    WebProxy proxy = new WebProxy(info.Proxy.Url, info.Proxy.Port);
                    if (!String.IsNullOrEmpty(info.Proxy.UserName) && !String.IsNullOrEmpty(info.Proxy.UserName))
                        request.Proxy.Credentials = new NetworkCredential(info.Proxy.UserName, info.Proxy.Password);
                }

                if (!String.IsNullOrEmpty(info.Referer))
                    request.Referer = info.Referer;

                if (!String.IsNullOrEmpty(info.UserAgent))
                    request.UserAgent = info.UserAgent;

                if (info.Credentials != null)
                {
                    request.PreAuthenticate = true;
                    request.Credentials = info.Credentials;
                }

                if (info.TimeOut > 0)
                    request.Timeout = info.TimeOut;

                if (null != info.RequestData)
                    if (info.RequestData.Length > 0)
                    {
                        request.ContentLength = info.RequestData.Length;
                        Stream stream = request.GetRequestStream();
                        stream.Write(info.RequestData, 0, info.RequestData.Length);
                        stream.Flush();
                        stream.Close();
                    }

                return request;
            }
            catch
            {
                throw;
            }
        }

        /// <summary>
        ///     Creates and encodes information used in form data to process
        ///     a user's request for a specified url. Usually a users's password and user name.
        /// </summary>
        /// <param name="keyVal">
        ///     The argument in key value pairs that pertains to the form that needs processing.
        ///     e.g: Email=yourEmail@Email.com&Password=yourPassword
        ///     This will be encoding use UrlEncode format.
        /// </param>
        /// <returns>
        ///     Returns the proper UrlEncoded key value pairs that the site would expect to see.
        ///</returns>
        public static Byte[] CreateFormData(Dictionary<String, String> keyVal)
        {
            StringBuilder sb = new StringBuilder();
            foreach (String key in keyVal.Keys)
            {
                sb.Append(String.Format("{0}={1}&", HttpUtility.UrlEncode(key), HttpUtility.UrlEncode(keyVal[key])));
            }
            return Encoding.Default.GetBytes(sb.ToString());
        }

        public static Byte[] CreateFormData(String userName, String password)
        {
            String temp = String.Format("email={0}&password={1}&", HttpUtility.UrlEncode(userName), HttpUtility.UrlEncode(password));
            return Encoding.Default.GetBytes(temp);
        }

        /// <summary>
        ///     Performs a post to a given Url with the proper UrlEncoded key value pairs/string.
        /// </summary>
        /// <param name="info">
        ///     The object that contains all the post data in byte format user name password and the Url plus more.
        ///     To be used in conjunction with the post request.
        /// </param>
        /// <returns>
        ///     Returns the Html of the response from the server.
        ///</returns>
        public static String PerformPost(HttpRequestInfo info)
        {
            info.Method = HttpMethod.Post;
            return GetHtml(info);
        }

        /// <summary>
        ///     Performs a normal GET request to a server/url.
        /// </summary>
        /// <param name="info">
        ///     Information about the Get Request.
        /// </param>
        /// <returns>
        ///     Returns the html that the user requested.
        /// </returns>
        public static String PerformGet(HttpRequestInfo info)
        {
            info.Method = HttpMethod.Get;
            return GetHtml(info);
        }

        public static String PerformGet(String url, CookieContainer cookies)
        {
            HttpRequestInfo info = new HttpRequestInfo();
            info.Url = url;
            info.Cookies = cookies;
            info.MaxRedirects = 5;
            info.Method = HttpMethod.Get;
            return GetHtml(info);
        }

        /// <summary>
        ///     The private factory that performs the abstracted operation
        ///     used in conjunction with Post and Get Requests.
        /// </summary>
        /// <param name="info">
        ///     Information used for either a post or get request.
        /// </param>
        /// <returns>
        ///     Returns the html the user requested.
        /// </returns>
        private static String GetHtml(HttpRequestInfo info)
        {
            HttpWebRequest request = null;
            HttpWebResponse response = null;
            StreamReader sr = null;
            try
            {
                request = CreateWebRequest(info);
                response = (HttpWebResponse)request.GetResponse();

                sr = new StreamReader(response.GetResponseStream());
                return sr.ReadToEnd();
            }
            finally
            {
                if (null != request)
                {
                    request.Abort();
                    request = null;
                }
                if (null != response)
                    response.Close();
            }
        }
    }

    #endregion


    #region [ HttpMethod ]

    public enum HttpMethod
    {
        Get,
        Post
    }

    #endregion
}


namespace LeroyNetSolutions.Net
{
    #region [ Using Directives ]

    using System;
    using System.Collections.Generic;
    using System.Text;
    using System.Net;

    #endregion


    #region [ HttpRequestInfo ]

    /// <summary>
    ///     An object used for a Get/Post request to the user defined Url/Uri.
    /// </summary>
    public class HttpRequestInfo
    {
        #region [ Properties ]

        private String _Url = default(String);
        public String Url
        {
            get { return _Url; }
            set { _Url = value; }
        }

        private HttpMethod _Method = HttpMethod.Get;
        public HttpMethod Method
        {
            get { return _Method; }
            set { _Method = value; }
        }

        private String _ContentType = "application/x-www-form-urlencoded";
        public String ContentType
        {
            get { return _ContentType; }
            set { _ContentType = value; }
        }

        private CookieContainer _Cookies = new CookieContainer();
        public CookieContainer Cookies
        {
            get { return _Cookies; }
            set { _Cookies = value; }
        }

        private NetworkCredential _Credentials = null;
        public NetworkCredential Credentials
        {
            get { return _Credentials; }
            set { _Credentials = value; }
        }

        private String _UserAgent = "Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US; rv:1.8.1) Gecko/20061010 Firefox/2.0";
        public String UserAgent
        {
            get { return _UserAgent; }
            set { _UserAgent = value; }
        }

        private String _Referer = "http://google.com";
        public String Referer
        {
            get { return _Referer; }
            set { _Referer = value; }
        }

        private String _Accept = default(String);
        public String Accept
        {
            get { return _Accept; }
            set { _Accept = value; }
        }

        private Boolean _AllowAutoRedirect = true;
        public Boolean AllowAutoRedirect
        {
            get { return _AllowAutoRedirect; }
            set { _AllowAutoRedirect = value; }
        }

        private Boolean _KeepAlive = true;
        public Boolean KeepAlive
        {
            get { return _KeepAlive; }
            set { _KeepAlive = value; }
        }

        private Int32 _MaxRedirects = default(Int32);
        public Int32 MaxRedirects
        {
            get { return _MaxRedirects; }
            set { _MaxRedirects = value; }
        }

        private HttpProxyInfo _Proxy = null;
        public HttpProxyInfo Proxy
        {
            get { return _Proxy; }
            set { _Proxy = value; }
        }

        private Byte[] _RequestData = null;
        public Byte[] RequestData
        {
            get { return _RequestData; }
            set { _RequestData = value; }
        }

        private Int32 _TimeOut = default(Int32);
        public Int32 TimeOut
        {
            get { return _TimeOut; }
            set { _TimeOut = value; }
        }

        #endregion


        #region [ Constructors ]

        public HttpRequestInfo() { }
        public HttpRequestInfo(String url)
        {
            this.Url = url;
        }
        public HttpRequestInfo(String url, HttpProxyInfo proxy)
        {
            this.Url = url;
            this.Proxy = proxy;
        }
        public HttpRequestInfo(String url, CookieContainer cookies)
        {
            this.Url = url;
            this.Cookies = cookies;
        }
        public HttpRequestInfo(String url, CookieContainer cookies, HttpProxyInfo proxy)
        {
            this.Url = url;
            this.Cookies = cookies;
            this.Proxy = proxy;
        }
        public HttpRequestInfo(String url, Byte[] requestData)
        {
            this.Url = url;
            this.RequestData = requestData;
        }
        public HttpRequestInfo(String url, CookieContainer cookies, Byte[] requestData)
        {
            this.Url = url;
            this.Cookies = cookies;
            this.RequestData = requestData;
        }
        public HttpRequestInfo(String url, CookieContainer cookies, Byte[] requestData, HttpProxyInfo proxy)
        {
            this.Url = url;
            this.Cookies = cookies;
            this.RequestData = requestData;
            this.Proxy = proxy;
        }

        #endregion
    }

    #endregion
}


namespace LeroyNetSolutions.Net
{
    #region [ Using Directives ]

    using System;
    using System.Collections.Generic;
    using System.Text;

    #endregion


    #region [ HttpProxyInfo ]

    public class HttpProxyInfo
    {
        #region [ Properties ]

        private String _Url = default(String);
        public String Url
        {
            get { return _Url; }
            set { _Url = value; }
        }

        private Int32 _Port = 1080;
        public Int32 Port
        {
            get { return _Port; }
            set { _Port = value; }
        }

        private String _UserName = default(String);
        public String UserName
        {
            get { return _UserName; }
            set { _UserName = value; }
        }

        private String _Password = default(String);
        public String Password
        {
            get { return _Password; }
            set { _Password = value; }
        }

        #endregion


        #region [ Constructors ]

        public HttpProxyInfo() { }
        public HttpProxyInfo(String url)
        {
            this.Url = url;
        }
        public HttpProxyInfo(String url, Int32 port)
        {
            this.Url = url;
            this.Port = port;
        }
        public HttpProxyInfo(String url, Int32 port, String userName, String password)
        {
            this.Url = url;
            this.Port = port;
            this.UserName = userName;
            this.Password = password;
        }

        #endregion
    }

    #endregion
}

 

William Le Roi

Comments

Comments C# != C
Wed. Apr. 2nd, 2008 2:50 PM    Scripter sehrgut
Comments explanation?
Fri. Apr. 20th, 2007 12:21 AM    Beginner tech0
Comments Author
Sat. Apr. 21st, 2007 11:03 AM    Beginner leroi
Comments Use?
Wed. May. 16th, 2007 11:54 PM    Syntax Master sundaramkumar

Voting