Here is my module to parse HTML code, tag by tag.
It's specific to my needs, but you can alter it to suit specific tags. If you need help, don't hesitate to ask.
As a warning, I probably didn't put enough error handling in it (as I made it to parse wikipedia, which uses perfect HTML)
I may have left out a sub here or there, so don't hesitate to tell me. I have subscribed to this thread to get emails
It's specific to my needs, but you can alter it to suit specific tags. If you need help, don't hesitate to ask.
As a warning, I probably didn't put enough error handling in it (as I made it to parse wikipedia, which uses perfect HTML)
B4X:
Sub Process_Globals
'These global variables will be declared once when the application starts.
'These variables can be accessed from all modules.
Dim dURL As String ,Title As String ,FileLoaded As String ,BaseHref As String
Type HTMLvalue(Key As String, Value As String)
Type HTMLtag(Level As Int, TagName As String, Node As String, Values As List)
End Sub
Sub ParseHTML(HTMLCode As String, URL As String)As String
Dim temp As Int,temp2 As Int, htag As String, tempstr As StringBuilder ,Name As String ,temp3 As String, Node As String ', BaseHREF
'tempstr="THIS IS A TEST OF THE LCARS WEB SYSTEM"
Title=""
BaseHREF=""
tempstr.Initialize
Do Until temp >= htmlcode.Length OR temp<0
Log(temp & "/" & htmlcode.Length)
If mid(htmlcode, temp,1) = "<" Then
temp2=htmlcode.IndexOf2(">", temp+1)
htag=mid(htmlcode, temp,temp2-temp+1)
temp=temp2+1
Log("HTML: " & htag)
Log("TAG: " & GetTagName(htag))
Name=GetTagName(htag)
Select Case Name.ToLowerCase
Case "a", "script", "title", "h1", "h2", "h3", "header","footer","style"
'If Not( name.EqualsIgnoreCase("a") AND htag.Contains(" name=") ) Then
temp3 = htmlcode.IndexOf2("</" & name, temp2)
Node=mid(htmlcode, temp2+1,temp3-temp2-1).Replace(""", "'").Trim
temp2=htmlcode.IndexOf2(">", temp3+1)
temp=temp2+1
Log("NODE2:" & node)
'End If
End Select
Select Case Name.ToLowerCase.Replace("/", "")
Case "base": BaseHREF=htag
Case "title": Title= node
Case "img"
'removed broken images
tempstr.Append(CRLF & htag.Replace("<img", "<img onerror=" & Chr(34) & "this.style.display='none'" & Chr(34) ) )
Case "h1", "h2", "h3", "h4"
'tempstr= tempstr & htag & node & "</" & name & ">"
tempstr.Append(CRLF & htag & node & "</" & name & ">")
Case "a"': tempstr= tempstr & MakeLCARbutton(lcar.LCAR_Orange, node)
If node.Length>0 Then
If Not (node.ToLowerCase.Contains("img")) Then node=node.ToUpperCase
node=MakeLCARbutton(lcar.LCAR_Orange, htag & node & "</A>")
'THIS IS THE PART THAT HANDLES URL, CHANGE THIS PART HERE TO SUIT YOUR NEEDS
End If
node=node.Replace(" href=" & Chr(34) & "#", " href=" & Chr(34) & ScrollTo(0) & "#")
'tempstr= tempstr & node
tempstr.Append(CRLF & node)
Case "meta", "link", "!--", "script", "style", "body", "div", "span", "nav" , "input", "form", "ul", "li", "header","section","footer" 'ignore these tags
Case Else
'tempstr = tempstr & htag
tempstr.Append(CRLF & htag)
End Select
Else
temp2=htmlcode.IndexOf2("<", temp+1)
If temp2>-1 Then
htag=mid(htmlcode, temp,temp2-temp).Trim
Else
temp2=htmlcode.Length
htag=right(htmlcode, temp2-temp).Trim
End If
temp=temp2
Select Case htag
Case CRLF, ""
Case Else
'Log("NODE: " & htag)
If CountAlphaNumericCharacters(htag) >0 Then 'tempstr=tempstr & CRLF & htag.Replace("•", "-")
tempstr.Append(CRLF & htag.Replace("•", "-"))
End If
End Select
End If
Loop
If BaseHREF.Length=0 Then
BaseHREF=left(url, url.LastIndexOf("/")+1)
node="<BASE HREF='" & basehref & "'>"
Else
Msgbox("EMERGENCY","EMERGENCY")
End If
Return node & tempstr.ToString
'htmlcode.IndexOf2(
End Sub
Sub GetTagName(content As String) As String
Dim temp As Long, temp2 As Long
temp = InStr(content, " ",0)
temp2 = InStr(content, ">",0)
If temp > 0 AND temp < temp2 Then temp2 = temp
Return Mid(content, 1, temp2 - 1)
End Sub
Sub CountAlphaNumericCharacters(Text As String)As Int
Dim temp As Int, Count As Int,Character As Int
For temp = 0 To text.Length-1
Character=Asc(mid(text,temp,1).ToLowerCase )
If ( character >= Asc("a") AND character <= Asc("z") ) OR ( character >= Asc("0") AND character <= Asc("9")) Then count=count+1
Next
Return count
End Sub
Sub Instr(Text As String, TextToFind As String, Start As Int) As Int
Return text.IndexOf2(texttofind,start)
End Sub
Sub Left(Text As String, Length As Long)As String
If length>text.Length Then length=text.Length
Return text.SubString2(0, length)
End Sub
Sub Right(Text As String, Length As Long) As String
If length>text.Length Then length=text.Length
Return text.SubString(text.Length-length)
End Sub
Sub Mid(Text As String, Start As Int, Length As Int) As String
If Length>0 AND start>-1 AND start< text.Length Then Return text.SubString2(start,start+length)
End Sub
I may have left out a sub here or there, so don't hesitate to tell me. I have subscribed to this thread to get emails
Last edited: