Code ausführen
<a href="link-ex-code.php">Code anzeigen</a>
<?php
/**
 * CDWHttpFile - minimal HTTP GET client built on fsockopen() that follows
 * "Location:" redirects.
 *
 * Only plain HTTP is spoken: the socket is opened to the bare host name and
 * the port defaults to 80 whatever the URL scheme is.
 *
 * Usage: $file = new CDWHttpFile($url); then inspect $file->bResult,
 * $file->strFile, $file->aHeaderLines and $file->strLocation.
 */
class CDWHttpFile
{
    /* URL of the last web page retrieved (can differ from the URL that was
       requested when an HTTP redirect was followed). */
    public $strLocation = '';

    /* Trimmed header lines of the last response, status line first. The empty
       line that terminates the header is stored as the last element. */
    public $aHeaderLines = array();

    /* Body of the last response; line endings are normalised to "\n". */
    public $strFile = '';

    /* true if the last web page was retrieved successfully, false otherwise. */
    public $bResult = false;

    /**
     * Optionally fetches $strUrl right away.
     *
     * Declared before the PHP 4 style method below so that every PHP version
     * picks this one as the constructor (PHP 8 no longer treats a method
     * named after the class as a constructor).
     *
     * @param string $strUrl URL to fetch; nothing is fetched when empty.
     */
    public function __construct($strUrl = '')
    {
        if (strlen((string) $strUrl) > 0)
            $this->ReadHttpFile($strUrl);
    }

    /**
     * Former PHP 4 style constructor, kept as an ordinary method so existing
     * explicit calls (e.g. from a subclass) keep working.
     *
     * @param string $strUrl URL to fetch; nothing is fetched when empty.
     */
    public function CDWHttpFile($strUrl)
    {
        if (strlen((string) $strUrl) > 0)
            $this->ReadHttpFile($strUrl);
    }

    /**
     * Fetches $strUrl with an HTTP/1.0 GET request and stores the outcome in
     * $bResult, $strFile, $aHeaderLines and $strLocation.
     *
     * Every header line starting with "Location:" is treated as a redirect,
     * whatever the status code is; a target equal to the requested URL is
     * ignored.
     *
     * @param string $strUrl URL of the page to get.
     * @param int    $iHttpRedirectMaxRecursiveCalls maximum number of
     *               redirects that are still allowed to be followed.
     * @return void  The result is reported through the object properties.
     */
    public function ReadHttpFile($strUrl, $iHttpRedirectMaxRecursiveCalls = 20)
    {
        // Resetting class data first, so every early return below leaves
        // the object in a consistent "failed" state.
        $this->bResult = false;
        $this->strFile = '';
        $this->aHeaderLines = array();
        $this->strLocation = $strUrl;

        // Parsing the URL: web server name/IP, path and port.
        $url = parse_url((string) $strUrl);
        // Without a host there is nothing to connect to.
        if (!is_array($url) || !isset($url['host']))
            return;
        // Setting path to "/" if not present in $strUrl.
        if (!isset($url['path'])) $url['path'] = '/';
        // Setting port to the default HTTP server port 80.
        if (!isset($url['port'])) $url['port'] = 80;

        // Connecting to the server (30 seconds connect timeout).
        $fp = fsockopen($url['host'], $url['port'], $errno, $errstr, 30);
        // Return if the socket was not opened; $this->bResult stays false.
        if (!$fp)
            return;

        // Composing the HTTP request. The Host header is what lets
        // name-based virtual hosts pick the right site.
        $strQuery = 'GET '.$url['path'];
        if (isset($url['query'])) $strQuery .= '?'.$url['query'];
        $strQuery .= " HTTP/1.0\r\n";
        $strQuery .= 'Host: '.$url['host'];
        if ($url['port'] != 80) $strQuery .= ':'.$url['port'];
        $strQuery .= "\r\n\r\n";
        fwrite($fp, $strQuery);

        /* $bHeader is true while the HTTP header is being received; the
           first empty line (end of the HTTP header) switches it to false. */
        $bHeader = true;
        while (!feof($fp))
        {
            // Reading at most 8191 bytes of the current line.
            $strLine = fgets($fp, 8192);
            // false means end of stream or a read error, not an empty line.
            if ($strLine === false)
                break;
            /* A chunk that neither ends in "\n" nor reaches the end of the
               stream is only the first part of a longer line, so no line
               break must be inserted after it. */
            $bLineComplete = (substr($strLine, -1) === "\n") || feof($fp);
            // Removing \r and \n characters.
            $strLine = str_replace(array("\r", "\n"), '', $strLine);
            if ($bHeader)
            {
                $this->aHeaderLines[] = trim($strLine);
                if ($strLine === '') $bHeader = false;
            }
            else
                $this->strFile .= $strLine.($bLineComplete ? "\n" : '');
        }
        fclose($fp);

        /* Processing all HTTP header lines and checking for the HTTP
           redirect directive 'Location:'. */
        foreach ($this->aHeaderLines as $strHeader)
        {
            if (strncasecmp($strHeader, 'Location:', 9) != 0)
                continue;
            $strTarget = trim(substr($strHeader, 9));
            if ($strTarget === '')
                continue;
            // Servers may send a relative target; make it absolute.
            $strTarget = $this->ResolveRedirectUrl($strTarget, $url);
            // A redirect to the page we just requested is not followed.
            if ($strTarget === (string) $strUrl)
                continue;
            /* If the maximum number of redirects is reached, just return.
               $this->bResult stays false. */
            if ($iHttpRedirectMaxRecursiveCalls <= 0)
                return;
            /* Calling the function recursively with the new URL and the
               number of remaining redirects reduced by one. */
            return $this->ReadHttpFile(
                $strTarget,
                $iHttpRedirectMaxRecursiveCalls - 1);
        }

        /* We get here if no redirect had to be followed: the web page was
           retrieved successfully. */
        $this->bResult = true;

        /* If magic_quotes_runtime is enabled in php.ini, the quotes in the
           received text are prefixed with slashes; undo that. ini_get()
           returns false where the setting does not exist, which skips
           this block. */
        if (ini_get('magic_quotes_runtime'))
        {
            $this->strFile = stripslashes($this->strFile);
            foreach ($this->aHeaderLines as $i => $strHeader)
                $this->aHeaderLines[$i] = stripslashes($strHeader);
        }
    }

    /**
     * Turns the value of a Location header into an absolute URL, using the
     * parse_url() parts of the request that produced it. Dot segments
     * ("./", "../") are not collapsed.
     *
     * @param string $strLocation non-empty Location header value.
     * @param array  $aBase       parse_url() parts with 'host', 'path' and
     *                            'port' set.
     * @return string absolute URL.
     */
    private function ResolveRedirectUrl($strLocation, $aBase)
    {
        // Already absolute (starts with "scheme://").
        if (preg_match('#^[a-z][a-z0-9+.\-]*://#i', $strLocation))
            return $strLocation;

        $strScheme = isset($aBase['scheme']) ? $aBase['scheme'] : 'http';
        // Scheme-relative: "//host/path".
        if (substr($strLocation, 0, 2) == '//')
            return $strScheme.':'.$strLocation;

        $strRoot = $strScheme.'://'.$aBase['host'];
        if ($aBase['port'] != 80) $strRoot .= ':'.$aBase['port'];

        // Host-relative: "/path".
        if ($strLocation[0] == '/')
            return $strRoot.$strLocation;
        // Query only: same document, new query string.
        if ($strLocation[0] == '?')
            return $strRoot.$aBase['path'].$strLocation;

        // Anything else is relative to the directory of the current path.
        $iSlash = strrpos($aBase['path'], '/');
        $strDir = ($iSlash === false) ? '/' : substr($aBase['path'], 0, $iSlash + 1);
        return $strRoot.$strDir.$strLocation;
    }
};
/**
 * CLinkScanner - crawls one web site: starting from an entry URL it fetches
 * pages with CDWHttpFile, extracts the <a href> links and keeps following
 * those that begin with the site base URL until none are left.
 */
class CLinkScanner
{
    /* Associative array of URLs not yet scanned for links.
       If $link is to be processed, $aUrlsToProcess[$link] = true */
    public $aUrlsToProcess = array();

    /* Associative array of URLs already scanned for links.
       If $url is already processed, $aProcessedUrls[$url] = true */
    public $aProcessedUrls = array();

    /* URLs which do not begin with $strSiteBaseUrl are not processed. */
    public $strSiteBaseUrl = '';

    /**
     * Scans $strPageText for links. New links that belong to the site are
     * added to $aUrlsToProcess.
     *
     * @param string $strPageText HTML text of the page.
     * @param string $strBaseUrl  URL of that page; relative links are
     *                            resolved against it.
     * @return void
     */
    public function RetrieveLinks($strPageText, $strBaseUrl)
    {
        $iFound = preg_match_all(
            "/<A[ \r\n\t]{1}[^>]*HREF[^=]*=[ '\"\n\r\t]*([^ \"'>\r\n\t#]+)[ \"'>\r\n\t#>][^>]*>/isU",
            (string) $strPageText,
            $aUrls);
        // Nothing matched, or the regex engine reported an error.
        if (!$iFound)
            return;

        $iBaseLength = strlen($this->strSiteBaseUrl);
        foreach ($aUrls[1] as $strUrl)
        {
            $strUrl = trim($strUrl);
            if ($strUrl === '') continue;
            // skipping email addresses
            if (strncasecmp($strUrl, 'mailto:', 7) == 0) continue;
            // skipping javascript code
            if (strncasecmp($strUrl, 'javascript:', 11) == 0) continue;
            // if $strUrl is not absolute, resolving it against the page URL
            $strUrl = $this->ResolveUrl($strUrl, (string) $strBaseUrl);
            /* If $strUrl points outside of the web site, skip it. */
            if (strncmp($strUrl, $this->strSiteBaseUrl, $iBaseLength) != 0)
                continue;
            /* If web page $strUrl has not been scanned for links yet,
               adding it to the list of not yet processed URLs. */
            if (!isset($this->aProcessedUrls[$strUrl]))
                $this->aUrlsToProcess[$strUrl] = true;
        }
    }

    /**
     * Repeatedly takes a URL from $aUrlsToProcess, fetches it and calls
     * RetrieveLinks on the result until the list is empty. Does nothing
     * when the list is empty to begin with.
     *
     * @return void
     */
    public function Start()
    {
        while (count($this->aUrlsToProcess) > 0)
        {
            // getting the first URL from the list of URLs to be processed
            reset($this->aUrlsToProcess);
            $strUrl = (string) key($this->aUrlsToProcess);
            // removing that URL from the list of URLs to be processed
            unset($this->aUrlsToProcess[$strUrl]);
            // adding that URL to the list of already processed URLs
            $this->aProcessedUrls[$strUrl] = true;

            // Retrieving the web page with URL $strUrl.
            $httpFile = new CDWHttpFile($strUrl);
            if ($httpFile->bResult == true) // if the web page is retrieved
            {
                /* In case we got to another URL because of an HTTP redirect,
                   adding the new URL to the list of processed URLs, and
                   removing it (if it exists there) from the list of URLs to
                   be processed. */
                $strUrl = $httpFile->strLocation;
                $this->aProcessedUrls[$strUrl] = true;
                unset($this->aUrlsToProcess[$strUrl]);
                // Finally, retrieving links
                $this->RetrieveLinks($httpFile->strFile, $httpFile->strLocation);
            }
        }
    }

    /**
     * Starts the whole process.
     *
     * @param string $strBaseUrl  prefix every URL must have to be followed.
     * @param string $strEntryUrl URL of the first page to scan.
     * @return void
     */
    public function Process($strBaseUrl, $strEntryUrl)
    {
        $this->strSiteBaseUrl = (string) $strBaseUrl;
        // Adding the entry point to the list of URLs to be processed.
        $this->aUrlsToProcess[$strEntryUrl] = true;
        $this->Start(); // Starting the link retrieval process.
    }

    /**
     * Makes a link found on page $strBaseUrl absolute.
     *
     * A link that starts with a scheme ("http:", "https:", "ftp:", ...) is
     * returned unchanged. Everything else is resolved against $strBaseUrl:
     * "//host/x" takes the page's scheme, "/x" is relative to the host,
     * "?q" keeps the page's path and "x" is relative to the page's
     * directory. "." and ".." path segments are collapsed.
     *
     * @param string $strUrl     non-empty link as it appears in the page.
     * @param string $strBaseUrl URL of the page containing the link.
     * @return string absolute URL.
     */
    private function ResolveUrl($strUrl, $strBaseUrl)
    {
        if (preg_match('#^[a-z][a-z0-9+.\-]*:#i', $strUrl))
            return $strUrl;

        $aBase = parse_url($strBaseUrl);
        // Base URL without a host: all that can be done is to join the two.
        if (!is_array($aBase) || !isset($aBase['host']))
            return rtrim($strBaseUrl, '/').'/'.ltrim($strUrl, '/');

        $strScheme = isset($aBase['scheme']) ? $aBase['scheme'] : 'http';
        if (substr($strUrl, 0, 2) == '//')
            return $strScheme.':'.$strUrl;

        $strRoot = $strScheme.'://'.$aBase['host'];
        if (isset($aBase['port'])) $strRoot .= ':'.$aBase['port'];
        $strBasePath = isset($aBase['path']) ? $aBase['path'] : '/';

        // Keeping the query string away from the dot-segment handling.
        $strQuery = '';
        $iQuery = strpos($strUrl, '?');
        if ($iQuery !== false)
        {
            $strQuery = substr($strUrl, $iQuery);
            $strUrl = ($iQuery === 0) ? '' : substr($strUrl, 0, $iQuery);
        }

        if ($strUrl === '')
            $strPath = $strBasePath;            // "?q": same document
        else if ($strUrl[0] == '/')
            $strPath = $strUrl;                 // relative to the host
        else
        {
            // relative to the directory of the page
            $iSlash = strrpos($strBasePath, '/');
            $strDir = ($iSlash === false) ? '/' : substr($strBasePath, 0, $iSlash + 1);
            $strPath = $strDir.$strUrl;
        }

        return $strRoot.$this->RemoveDotSegments($strPath).$strQuery;
    }

    /**
     * Collapses "." and ".." segments of an absolute path, so that the same
     * page is not queued again under a different spelling. ".." never climbs
     * above the root.
     *
     * @param string $strPath path starting with "/".
     * @return string normalised path.
     */
    private function RemoveDotSegments($strPath)
    {
        $aSegments = explode('/', $strPath);
        $aKept = array();
        foreach ($aSegments as $strSegment)
        {
            if ($strSegment === '.')
                continue;
            if ($strSegment === '..')
            {
                // Element 0 is the empty string before the leading "/".
                if (count($aKept) > 1) array_pop($aKept);
                continue;
            }
            $aKept[] = $strSegment;
        }
        // A path ending in "." or ".." names a directory: keep the final "/".
        $strLast = $aSegments[count($aSegments) - 1];
        if ($strLast === '.' || $strLast === '..')
            $aKept[] = '';
        return implode('/', $aKept);
    }
};
// Crawling the site from its articles page, then listing every URL that was
// visited, one per line.
$linkScanner = new CLinkScanner();
$linkScanner->Process("http://www.digiways.com", "http://www.digiways.com/articles/");
// The URLs come from remote pages, so they are escaped before being written
// into the HTML output.
foreach ($linkScanner->aProcessedUrls as $strUrl => $bTrue)
    echo htmlspecialchars((string) $strUrl, ENT_QUOTES | ENT_SUBSTITUTE)."<br>";
?>