Imports System.IO

'If the uploaded file is a .txt, rebind the ListBox to the .txt files
If FileUpload1.FileName.EndsWith(".txt") Then
    Dim myDirectoryInfo As New DirectoryInfo("C:\Temp\")
    Dim myFileInfo() As FileInfo = myDirectoryInfo.GetFiles("*.txt")
    ListBox1.DataSource = myFileInfo
    ListBox1.DataBind()
End If
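If the bound items show up as full paths, you can point the ListBox at the FileInfo object's Name property before binding (DataTextField is a standard ListControl property; the rest is from the snippet above):

ListBox1.DataTextField = "Name" 'display FileInfo.Name instead of ToString()
ListBox1.DataSource = myFileInfo
ListBox1.DataBind()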
Create a small function that loops through a list of unaccepted (or accepted) extensions; if the given URL has an extension that is not accepted, flag it. Here is a small example:
Private Function AcceptUrl(ByVal url As String) As Boolean
    'Extensions to skip - ideally this list would come from the web.config
    Dim ignoredExtensions As String = "pdf,doc,jpg,jpeg"
    Dim ignoredExtensionsList As String() = ignoredExtensions.Split(","c)
    For Each extension As String In ignoredExtensionsList
        'Case-insensitive so ".PDF" is caught as well
        If url.EndsWith(extension, StringComparison.OrdinalIgnoreCase) Then
            Return False
        End If
    Next
    Return True
End Function
You can call this with:
If AcceptUrl(myUrl) Then ListBox1.Items.Add(myUrl)
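As the comment in AcceptUrl notes, the extension list ideally comes from web.config. A minimal sketch, assuming a hypothetical appSettings key named IgnoredExtensions (the key name is an assumption, not something already in your project):

'web.config (hypothetical key name):
'<appSettings>
'    <add key="IgnoredExtensions" value="pdf,doc,jpg,jpeg" />
'</appSettings>

'Inside AcceptUrl, read the list instead of hard-coding it:
Dim ignoredExtensions As String = _
    System.Configuration.ConfigurationManager.AppSettings("IgnoredExtensions")
If ignoredExtensions Is Nothing Then ignoredExtensions = String.Empty
Dim ignoredExtensionsList As String() = ignoredExtensions.Split(","c)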
Your code works perfectly with my listbox, but there is another problem with the crawler. When it crawls webpages it always loads unwanted pages like .doc, .pdf ...
I will paste the code here to give you an idea, if you want.
Maybe you can tell me where I should put your code.
Many thanks, Marcel!
---------------------------------------------------------------
'CrawlURL: Public interface
Public Sub CrawlURL(ByVal URL As String)
    CrawlURL(URL, 0)
    Do While m_qToDo.Count > 0
        URL = CType(m_qToDo.Dequeue(), String)
        CrawlURL(URL, 0)
    Loop
End Sub
'CrawlURL: Private interface
Private Sub CrawlURL(ByVal URL As String, ByVal Level As Integer)
    'Populate the page contents
    If Not m_pgeCurrent.LoadSource(URL) Then
        RaiseEvent PageComplete(URL, -1)
        Exit Sub
    End If
    'Pass back the text on this page to the caller
    Dim bNoFollow As Boolean = False
    RaiseEvent NewPage(URL, m_pgeCurrent, Level, bNoFollow)
    'Set the recursion level
    Level = Level + 1
    If Not bNoFollow Then
        If Not m_pgeCurrent.NoFollow Then
            'Grab the anchor tags on this page
            Dim strHRefs() As String = m_pgeCurrent.GetHRefs()
            'Continue if applicable
            If Not strHRefs Is Nothing Then
                'ShuffleURLs will screen out any already on the
                'list, and shuffle the list - in case there is
                'another process running simultaneously.
                strHRefs = ShuffleURLs(m_pgeCurrent.Host, strHRefs)
                'Crawl or save each HRef
                If Not strHRefs Is Nothing Then
                    'Send the HRefs list back to the object owner
                    RaiseEvent Queuing(URL, Level, strHRefs)
                    Dim i As Integer
                    For i = 0 To UBound(strHRefs)
                        If m_bCancel Then
                            Exit Sub
                        End If
                        'Check the recursion level
                        If Level <= m_intMaxLevel Then
                            'Haven't exceeded the level - continue
                            Me.CrawlURL(strHRefs(i), Level)
                        Else
                            'Maxed out here - flag it for later
                            m_qToDo.Enqueue(strHRefs(i))
                        End If
                    Next
                End If
            End If
        End If
    End If
    Level = Level - 1
    'Page complete
    RaiseEvent PageComplete(URL, 0)
End Sub
Private Sub CrawlURL()
    Dim strURL As String = CType(m_qToDo.Dequeue(), String)
    Me.CrawlURL(strURL)
End Sub
'ShuffleURLs: Build a list of URLs to crawl in a random order
Private Function ShuffleURLs(ByVal strDomainName As String, _
                             ByVal strHRefs() As String) As String()
    'Weed the list of anything mal-formed or already on the list
    Dim strReturn() As String
    Dim i As Integer
    Dim j As Integer
    For i = 0 To strHRefs.Length - 1
        If WeWannaCrawl(strDomainName, strHRefs(i)) Then
            ReDim Preserve strReturn(j)
            strReturn(j) = strHRefs(i)
            j = j + 1
        End If
    Next
    'Shuffle the list if there's anything left
    If Not strReturn Is Nothing Then
        Randomize()
        Dim bAnchorsVisited(strReturn.Length - 1) As Boolean
        For i = 0 To strReturn.Length - 1
            'Pick a random index that hasn't been used as a swap target yet
            Dim intAnchorIndex As Integer = CInt(Int(strReturn.Length * Rnd()))
            Do While bAnchorsVisited(intAnchorIndex)
                intAnchorIndex = CInt(Int(strReturn.Length * Rnd()))
            Loop
            bAnchorsVisited(intAnchorIndex) = True
            'Swap the current element with the randomly chosen one
            Dim strTemp As String = strReturn(i)
            strReturn(i) = strReturn(intAnchorIndex)
            strReturn(intAnchorIndex) = strTemp
        Next i
    End If
    Return strReturn
End Function
-----------------------------------------------------------------
WeWannaCrawl just checks whether the domain name has the [<...] at the beginning.
If you're checking to see if the URL is valid before you crawl it, then you might as well add the code for checking the extension here as well.
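For example, the extension check can go right next to the WeWannaCrawl call inside the ShuffleURLs weeding loop. A sketch only, reusing the AcceptUrl function from earlier with the loop you already have:

For i = 0 To strHRefs.Length - 1
    'Skip mal-formed/duplicate URLs and unwanted extensions
    If WeWannaCrawl(strDomainName, strHRefs(i)) _
        AndAlso AcceptUrl(strHRefs(i)) Then
        ReDim Preserve strReturn(j)
        strReturn(j) = strHRefs(i)
        j = j + 1
    End If
Next

That way .doc and .pdf links are dropped before they are ever queued or crawled.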