Android Tutorial Parse HTML code

Here is my module to parse HTML code, tag by tag.
It's specific to my needs, but you can alter it to suit specific tags. If you need help, don't hesitate to ask.
As a warning, I probably didn't put enough error handling in it (as I made it to parse wikipedia, which uses perfect HTML)

B4X:
Sub Process_Globals
   'These global variables will be declared once when the application starts.
   'These variables can be accessed from all modules.
   Dim dURL As String ,Title As String ,FileLoaded As String ,BaseHref As String 
   
   Type HTMLvalue(Key As String, Value As String)
   Type HTMLtag(Level As Int, TagName As String, Node As String, Values As List)
   
End Sub



Sub ParseHTML(HTMLCode As String, URL As String)As String 
   Dim temp As Int,temp2 As Int, htag As String, tempstr As StringBuilder  ,Name As String ,temp3 As String, Node As String ', BaseHREF
   'tempstr="THIS IS A TEST OF THE LCARS WEB SYSTEM"
   Title=""
   BaseHREF=""
   tempstr.Initialize 
   
   
   Do Until temp >= htmlcode.Length OR temp<0
      Log(temp & "/" & htmlcode.Length)
      If mid(htmlcode, temp,1) = "<" Then
         temp2=htmlcode.IndexOf2(">", temp+1)
         htag=mid(htmlcode, temp,temp2-temp+1)
         temp=temp2+1
         
         Log("HTML: " & htag)
         Log("TAG: " & GetTagName(htag))
         
         Name=GetTagName(htag)
         Select Case Name.ToLowerCase 
            Case "a", "script", "title", "h1", "h2", "h3", "header","footer","style"
               'If Not( name.EqualsIgnoreCase("a") AND htag.Contains(" name=") ) Then
                  temp3 = htmlcode.IndexOf2("</" & name, temp2)
                  Node=mid(htmlcode, temp2+1,temp3-temp2-1).Replace("&quot;", "'").Trim 
                  temp2=htmlcode.IndexOf2(">", temp3+1)
                  temp=temp2+1
                  Log("NODE2:" & node)
               'End If
         End Select
         
         Select Case Name.ToLowerCase.Replace("/", "")
            Case "base": BaseHREF=htag
            Case "title":    Title= node
            Case "img"
               'removed broken images
               tempstr.Append(CRLF & htag.Replace("<img", "<img onerror=" & Chr(34) & "this.style.display='none'" & Chr(34) ) )
            Case "h1", "h2", "h3", "h4"
               'tempstr= tempstr & htag & node & "</" & name & ">"
               tempstr.Append(CRLF & htag & node & "</" & name & ">")
            Case "a"':      tempstr= tempstr & MakeLCARbutton(lcar.LCAR_Orange, node)
               If node.Length>0 Then
                  If Not (node.ToLowerCase.Contains("img")) Then node=node.ToUpperCase 
                  node=MakeLCARbutton(lcar.LCAR_Orange, htag & node & "</A>")
'THIS IS THE PART THAT HANDLES URL, CHANGE THIS PART HERE TO SUIT YOUR NEEDS

               End If
               node=node.Replace(" href=" & Chr(34) & "#", " href=" & Chr(34) & ScrollTo(0) & "#")
               'tempstr= tempstr & node
               tempstr.Append(CRLF & node)
            Case "meta", "link", "!--", "script", "style", "body", "div", "span", "nav" , "input", "form", "ul", "li", "header","section","footer"  'ignore these tags
            Case Else
               'tempstr = tempstr & htag 
               tempstr.Append(CRLF & htag)
         End Select
         
      Else
         temp2=htmlcode.IndexOf2("<", temp+1)
         If temp2>-1 Then
            htag=mid(htmlcode, temp,temp2-temp).Trim
         Else
            temp2=htmlcode.Length 
            htag=right(htmlcode, temp2-temp).Trim 
         End If
         temp=temp2
         Select Case htag
            Case CRLF, "" 
            Case Else 
               'Log("NODE: " & htag)
               If CountAlphaNumericCharacters(htag) >0 Then 'tempstr=tempstr & CRLF & htag.Replace("•", "-")
                  tempstr.Append(CRLF & htag.Replace("•", "-"))
               End If
         End Select
      End If
      
   Loop
   
   If BaseHREF.Length=0 Then
      BaseHREF=left(url, url.LastIndexOf("/")+1)
      node="<BASE HREF='" & basehref & "'>"
   Else
      Msgbox("EMERGENCY","EMERGENCY")
   End If
   Return node & tempstr.ToString 
   'htmlcode.IndexOf2(
End Sub


Sub GetTagName(content As String) As String
    Dim temp As Long, temp2 As Long
    temp = InStr(content, " ",0)
    temp2 = InStr(content, ">",0)
    If temp > 0 AND temp < temp2 Then temp2 = temp
    Return Mid(content, 1, temp2 - 1)
End Sub

Sub CountAlphaNumericCharacters(Text As String)As Int
   Dim temp As Int, Count As Int,Character As Int 
   For temp = 0 To text.Length-1
      Character=Asc(mid(text,temp,1).ToLowerCase )
      If ( character >= Asc("a") AND character <= Asc("z") ) OR ( character >= Asc("0") AND character <= Asc("9")) Then count=count+1
   Next
   Return count
End Sub




Sub Instr(Text As String, TextToFind As String, Start As Int) As Int
   Return text.IndexOf2(texttofind,start)
End Sub
Sub Left(Text As String, Length As Long)As String 
   If length>text.Length Then length=text.Length 
   Return text.SubString2(0, length)
End Sub
Sub Right(Text As String, Length As Long) As String
   If length>text.Length Then length=text.Length 
   Return text.SubString(text.Length-length)
End Sub
Sub Mid(Text As String, Start As Int, Length As Int) As String 
   If Length>0 AND start>-1 AND start< text.Length Then Return text.SubString2(start,start+length)
End Sub

I may have left out a sub here or there, so don't hesitate to tell me. I have subscribed to this thread to get emails
 
Last edited:

urikupfer

Member
Licensed User
Longtime User
Hi
For parsing html you can use the FromHtml(html string) function from Martin's labelextra library, it can parse many html tags.
uri
 

mohdsaks

Member
Licensed User
Longtime User
hi i like what u did really i need this code put am new with b4a my question is if i have a html page let say www.mypage.com/page1.htm and i want to get value from it let say the value is temperature for the room how i can do this with your code please help ??
 

Massy

Member
Licensed User
Longtime User
Hi, I'm trying to use your work but I get two errors... could you please help me understand why?

First error is in the line:
Node=MakeLCARbutton(lcar.LCAR_Orange, htag & Node & "</A>")
error says "Undeclared variable 'makelcarbutton' is used before it was assigned any value"
as it says "button" I tryed to add a button in designer named MakeLCARbutton but error changes in "Use of undeclared array: makelcarbutton"...

Second error is some lines down in:
Node=Node.Replace(" href=" & Chr(34) & "#", " href=" & Chr(34) & ScrollTo(0) & "#")
and it says "Undeclared variable 'scrollto' is used before it was assigned any value"

Thanks for your help
Massy
 

NeoTechni

Well-Known Member
Licensed User
Longtime User
MakeLCARbutton just converts an A HREF tag into HTML for a button in my app.
ScrollTo just returns the currently loaded HTML filename
Both can be swapped out for your own code
 
Top