Forum Moderators: coopster
What I need it to do is extract all links (URLs), the page's words (both as one large string snapshot of the page and as an array of each individual word), and the META tags...
is there a good script/code available online somewhere which works?
Got a question though: is there any way to 'resolve' relative links?
A lot of sites link to other files with relative paths — how do you tack on the domain/directory info?
(i.e., [microsoft.com...] links to "../wow.htm" — how do I turn that into [microsoft.com...]?)
<?php
// Head, Tail are pointers into the entire document at Text
function ParseHTML ($Text, &$Head, &$Tail, &$Attr)
{
$Chunk = substr ($Text, $Head);
$ChunkLen = strlen ($Chunk);
if ($Chunk [0] == '<') {
if (($Chunk [1] == '!') &&
($Chunk [2] == '-') &&
($Chunk [3] == '-')) {
$x = strpos ($Chunk, "-->");
if ($x > 0) {
$x += 2;
$Tail = $Head + $x;
$Attr = ATTR_COMMENT;
} else {
$Tail = $Head + $ChunkLen;
$Attr = ATTR_COMMENT;
}
} else if (strncasecmp ($Chunk, "<script", 7) == 0) {
$x = stripos ($Chunk, "</script>");
if ($x > 0) {
$x += 8;
$Tail = $Head + $x;
$Attr = ATTR_SCRIPT;
} else {
$Tail = $Head + $ChunkLen;
$Attr = ATTR_SCRIPT;
}
} else {
// catch all other tags!
$x = strpos ($Chunk, ">");
if ($x > 0) {
$Tail = $Head + $x;
$Attr = ATTR_TAG;
} else {
$Tail = $Head + $ChunkLen;
$Attr = ATTR_TAG;
}
}
} else {
$x = strpos ($Chunk, "<");
if ($x > 0) {
$Tail = $Head + $x - 1;
$Attr = ATTR_TEXT;
} else {
$Tail = $Head + $ChunkLen;
$Attr = ATTR_TEXT;
}
}
}
?>