Forum Moderators: martinibuster
// List every link in an HTML file that points at $domain or one of its
// subdomains, printing one "<href>\t<anchor text>" line per match.
$domain = 'example.com'; // Domain you're interested in
$dom = new DOMDocument; // Use DOM Document class
// loadHTMLFile() reads markup from a path. loadHTML() expects the markup
// itself, so passing it a file name would parse the name as the document.
$dom->loadHTMLFile('htmlfile.htm');
$elements = $dom->getElementsByTagName('a'); // Get all <a> tags

// Build the host pattern once. preg_quote() makes the dots in the domain match
// literally, so "example.com" does not also match e.g. "examplexcom".
$pattern = '/(^|\.)' . preg_quote($domain, '/') . '$/i';

// Iterate through <a> tags
foreach ($elements as $e) {
    if (!$e->hasAttribute('href')) { // Only consider <a> tags with an href attribute
        continue;
    }

    $href = $e->getAttribute('href');
    $parsed = parse_url($href); // Parse URL into its components

    // Ignore relative URLs, badly formed URLs, javascript etc.
    // (parse_url() returns false on seriously malformed input; isset() covers that too.)
    if (!isset($parsed['host'])) {
        continue;
    }

    // Match the domain itself and any subdomain
    if (preg_match($pattern, $parsed['host'])) {
        echo $href, "\t", $e->nodeValue, "\n"; // echo the href and anchor text
    }
}