<a href="link-ex.php">Code ausf&uuml;hren</a><BR><BR>
<pre>
&lt;a href=&quot;link-ex-code.php&quot;&gt;Code anzeigen&lt;/a&gt;
&lt;?php

class CDWHttpFile
{
    /* $strLocation - URL of the last web page retrieved (could be different
        from what was requested in case of an HTTP redirect). */
    var $strLocation = &quot;&quot;;
    /* $aHeaderLines - header lines of the last response, one trimmed line
        per element. The blank line that ends the header is stored as well. */
    var $aHeaderLines = array();
    /* $strFile - body of the last web page retrieved; every line read from
        the socket is stored with a single trailing \n. */
    var $strFile = &quot;&quot;;
    /* $bResult - contains true if the last web page was
        retrieved successfully, false otherwise. */
    var $bResult = false;

    /* ReadHttpFile - the function that does all the work.
        $strUrl - URL of the page we want to get.
        $iHttpRedirectMaxRecursiveCalls - maximum number of
        times an HTTP redirection is followed.
        Returns nothing; the outcome is stored in $bResult, $strLocation,
        $aHeaderLines and $strFile. */
    function ReadHttpFile($strUrl, $iHttpRedirectMaxRecursiveCalls = 20)
    {
        /* Resetting class data first, so that every early return below
            leaves the object in a well-defined failed state. */
        $this-&gt;bResult = false;
        $this-&gt;strFile = &quot;&quot;;
        $this-&gt;aHeaderLines = array();
        $this-&gt;strLocation = $strUrl;

        // parsing the url, getting web server name/IP, path and port.
        $url = parse_url($strUrl);
        /* A URL without a host (for example a relative redirect target)
            cannot be fetched; $this-&gt;bResult stays false. */
        if (isset($url[&quot;host&quot;]) == false)
            return;
        // setting path to the server root if not present in $strUrl
        if (isset($url[&quot;path&quot;]) == false) $url[&quot;path&quot;] = &quot;/&quot;;
        /* setting port to the default HTTP server port 80; a non-default
            port has to be repeated in the Host header. */
        $strHostHeader = $url[&quot;host&quot;];
        if (isset($url[&quot;port&quot;]) == false)
            $url[&quot;port&quot;] = 80;
        else if ($url[&quot;port&quot;] != 80)
            $strHostHeader .= &quot;:&quot;.$url[&quot;port&quot;];

        // connecting to the server (30 seconds connection timeout)
        $fp = fsockopen($url[&quot;host&quot;], $url[&quot;port&quot;], $errno, $errstr, 30);
        /* Return if the socket was not opened. $this-&gt;bResult is false. */
        if (!$fp)
            return;

        // composing HTTP request
        $strQuery = &quot;GET &quot;.$url[&quot;path&quot;];
        if (isset($url[&quot;query&quot;]) == true) $strQuery .= &quot;?&quot;.$url[&quot;query&quot;];
        $strQuery .= &quot; HTTP/1.0\r\n&quot;;
        /* The Host header lets servers that host several sites on one
            address pick the right one. */
        $strQuery .= &quot;Host: &quot;.$strHostHeader.&quot;\r\n\r\n&quot;;
        // sending the request to the server
        fputs($fp, $strQuery);

        /* $bHeader is true while we receive the HTTP header; after the
            empty line (end of HTTP header) it is set to false. */
        $bHeader = true;
        // continuing until there is no more text to read from the socket
        while (!feof($fp))
        {
            // reading a line of text from the socket, at most 8192 bytes
            $strLine = fgets($fp, 8192);
            /* fgets returns false once nothing is left to read; without
                this check a spurious empty line would be recorded. */
            if ($strLine === false) break;
            // removing trailing \n and \r characters
            $strLine = rtrim($strLine, &quot;\r\n&quot;);
            if ($bHeader == false)
                $this-&gt;strFile .= $strLine.&quot;\n&quot;;
            else
                $this-&gt;aHeaderLines[] = trim($strLine);
            if (strlen($strLine) == 0) $bHeader = false;
        }
        fclose($fp);

        /* Processing all HTTP header lines and checking for the
            HTTP redirect directive 'Location:'. */
        $iHeaderCount = count($this-&gt;aHeaderLines);
        for ($i = 0; $i &lt; $iHeaderCount; $i++)
            if (strcasecmp(substr($this-&gt;aHeaderLines[$i], 0, 9), &quot;Location:&quot;) == 0)
            {
                // $strTarget is the URL of the web page we are relocated to
                $strTarget = trim(substr($this-&gt;aHeaderLines[$i], 9));
                // If it is the same page we are requesting, just continue
                if ($strTarget != $strUrl)
                {
                    /* If the maximum number of redirects is reached,
                        just return. $this-&gt;bResult is false. */
                    if ($iHttpRedirectMaxRecursiveCalls &lt;= 0) return;
                    /* Calling the function recursively with the new URL
                        and the number of allowed redirections reduced by one. */
                    return $this-&gt;ReadHttpFile(
                                $strTarget,
                                $iHttpRedirectMaxRecursiveCalls - 1);
                }
            }

        /* We get here if no HTTP redirect had to be followed.
            The web page was retrieved successfully. */
        $this-&gt;bResult = true;

        /* If magic_quotes_runtime is enabled in php.ini, then all the quotes
            in the received text are prefixed with slashes; removing them. */
        if (ini_get(&quot;magic_quotes_runtime&quot;))
        {
            $this-&gt;strFile = stripslashes($this-&gt;strFile);
            for ($i = 0; $i &lt; $iHeaderCount; $i++)
                $this-&gt;aHeaderLines[$i] = stripslashes($this-&gt;aHeaderLines[$i]);
        }
    }

    /* Constructor: accepts a URL and retrieves it right away by calling
        ReadHttpFile. An empty URL creates an idle object ($bResult false). */
    function __construct($strUrl)
    {
        if (strlen($strUrl) &gt; 0)
            $this-&gt;ReadHttpFile($strUrl);
    }

    /* Class-named constructor kept for PHP 4, where __construct is not
        recognised as a constructor. It simply delegates to __construct. */
    function CDWHttpFile($strUrl)
    {
        $this-&gt;__construct($strUrl);
    }
};


class CLinkScanner
{
    /* $aUrlsToProcess is an associative array of URLs not yet scanned for links.
        If $link is to be processed, $aUrlsToProcess[$link] = true */
    var $aUrlsToProcess = array();
    /* $aProcessedUrls is an associative array of URLs already scanned for links.
        If $url is already processed, $aProcessedUrls[$url] = true */
    var $aProcessedUrls = array();
    /* The scanner does not process URLs which don't begin with $strSiteBaseUrl. */
    var $strSiteBaseUrl = &quot;&quot;;

    /*
        Function ResolveUrl turns the relative link $strUrl found on the page
        $strBaseUrl into an absolute URL and returns it.
        Dot segments (./ and ../) are not collapsed.
    */
    function ResolveUrl($strUrl, $strBaseUrl)
    {
        $base = parse_url($strBaseUrl);
        /* Without a host there is nothing to resolve against;
            falling back to plain concatenation. */
        if (isset($base[&quot;host&quot;]) == false)
            return $strBaseUrl.$strUrl;

        $strScheme = isset($base[&quot;scheme&quot;]) ? $base[&quot;scheme&quot;] : &quot;http&quot;;
        // a link starting with two slashes only inherits the scheme
        if (substr($strUrl, 0, 2) == &quot;//&quot;)
            return $strScheme.&quot;:&quot;.$strUrl;

        $strRoot = $strScheme.&quot;://&quot;.$base[&quot;host&quot;];
        if (isset($base[&quot;port&quot;])) $strRoot .= &quot;:&quot;.$base[&quot;port&quot;];
        $strPath = isset($base[&quot;path&quot;]) ? $base[&quot;path&quot;] : &quot;/&quot;;

        // a link starting with one slash is relative to the server root
        if ($strUrl[0] == '/')
            return $strRoot.$strUrl;
        // a query-only link keeps the path of the current page
        if ($strUrl[0] == '?')
            return $strRoot.$strPath.$strUrl;
        /* anything else is relative to the directory of the current page,
            i.e. the base path up to and including its last slash. */
        return $strRoot.substr($strPath, 0, strrpos($strPath, '/') + 1).$strUrl;
    }

    /*
        Function RetrieveLinks scans $strPageText for links.
        $strBaseUrl is the URL of the page the text was retrieved from.
        If new links are found, they are added to $aUrlsToProcess.
    */
    function RetrieveLinks($strPageText, $strBaseUrl)
    {
        preg_match_all(
            &quot;/&lt;A[ \r\n\t]{1}[^&gt;]*HREF[^=]*=[ '\&quot;\n\r\t]*([^ \&quot;'&gt;\r\n\t#]+)[ \&quot;'&gt;\r\n\t#&gt;][^&gt;]*&gt;/isU&quot;,
            $strPageText,
            $aUrls);
        foreach($aUrls[1] as $strUrl)
        {
            $strUrl = trim($strUrl);
            if (preg_match('/^[a-z][a-z0-9+.-]*:/i', $strUrl))
            {
                /* The link carries its own scheme. Only plain http can be
                    fetched, so mailto:, javascript:, https: etc. are skipped. */
                if (strncasecmp($strUrl, &quot;http://&quot;, 7) != 0) continue;
            }
            else
            {
                // $strUrl is relative, resolving it against the current page
                $strUrl = $this-&gt;ResolveUrl($strUrl, $strBaseUrl);
            }
            /* If $strUrl points outside of the web site, skip it. */
            if (strlen($strUrl) &lt; strlen($this-&gt;strSiteBaseUrl) ||
                substr($strUrl, 0, strlen($this-&gt;strSiteBaseUrl)) !=
                    $this-&gt;strSiteBaseUrl) continue;

            /* If web page $strUrl has not been scanned for links yet,
                adding it to the list of not yet processed URLs. */
            if (isset($this-&gt;aProcessedUrls[$strUrl]) == false)
                $this-&gt;aUrlsToProcess[$strUrl] = true;
        }
    }


    /* Function Start repeatedly calls RetrieveLinks until the list
        of URLs to be processed is empty. */
    function Start()
    {
        /* Checking the list before the first iteration, so that an
            empty list does not record an empty URL as processed. */
        while (count($this-&gt;aUrlsToProcess) != 0)
        {
            // getting first URL from the list of URLs to be processed
            reset($this-&gt;aUrlsToProcess);
            $strUrl = key($this-&gt;aUrlsToProcess);
            // removing that URL from the list of URLs to be processed
            unset($this-&gt;aUrlsToProcess[$strUrl]);
            // adding that URL to the list of already processed URLs
            $this-&gt;aProcessedUrls[$strUrl] = true;

            /* Using the CDWHttpFile class to retrieve the web page
                with URL $strUrl, following HTTP redirects. */
            $httpFile = new CDWHttpFile($strUrl);
            if ($httpFile-&gt;bResult == true) // if the web page is retrieved
            {
                /* In case we got to another URL because of an HTTP redirect,
                adding the new URL to the list of processed URLs, and removing
                it (if it exists there) from the list of URLs to be processed. */
                $strUrl = $httpFile-&gt;strLocation;
                $this-&gt;aProcessedUrls[$strUrl] = true;
                unset($this-&gt;aUrlsToProcess[$strUrl]);
                // Finally, retrieving links
                $this-&gt;RetrieveLinks($httpFile-&gt;strFile, $httpFile-&gt;strLocation);
            }
        }
    }

    /* Function Process starts the whole scan.
        $strBaseUrl - prefix every URL must begin with to be followed.
        $strEntryUrl - the page the scan starts from. */
    function Process($strBaseUrl, $strEntryUrl)
    {
        $this-&gt;strSiteBaseUrl = $strBaseUrl;
        // Adding the entry point to the list of URLs to be processed.
        $this-&gt;aUrlsToProcess[$strEntryUrl] = true;
        $this-&gt;Start(); // Starting the link retrieval process.
    }
};

    // Scanning the site, starting from the articles page, and listing every URL visited.
    $linkScanner = new CLinkScanner();
    $linkScanner-&gt;Process(&quot;http://www.digiways.com&quot;, &quot;http://www.digiways.com/articles/&quot;);
    // The URLs come from scraped pages, so they are escaped before being echoed into HTML.
    foreach($linkScanner-&gt;aProcessedUrls as $strUrl =&gt; $bTrue) echo htmlspecialchars($strUrl).&quot;&lt;br&gt;&quot;;

?&gt;</pre>