Here is my module to parse HTML code, tag by tag.
It's specific to my needs, but you can alter it to suit specific tags. If you need help, don't hesitate to ask.
As a warning, I probably didn't put enough error handling in it (as I made it to parse wikipedia, which uses perfect HTML)
	
	
	
	
	
	
	
		
			
			
			
			
			
		
	
	
	
		
	
	
		
	
I may have left out a sub here or there, so don't hesitate to tell me. I have subscribed to this thread to get emails
			
			It's specific to my needs, but you can alter it to suit specific tags. If you need help, don't hesitate to ask.
As a warning, I probably didn't put enough error handling in it (as I made it to parse wikipedia, which uses perfect HTML)
			
				B4X:
			
		
		
		Sub Process_Globals
   'These global variables will be declared once when the application starts.
   'These variables can be accessed from all modules.
   Dim dURL As String ,Title As String ,FileLoaded As String ,BaseHref As String 
   
   Type HTMLvalue(Key As String, Value As String)
   Type HTMLtag(Level As Int, TagName As String, Node As String, Values As List)
   
End Sub
Sub ParseHTML(HTMLCode As String, URL As String)As String 
   Dim temp As Int,temp2 As Int, htag As String, tempstr As StringBuilder  ,Name As String ,temp3 As String, Node As String ', BaseHREF
   'tempstr="THIS IS A TEST OF THE LCARS WEB SYSTEM"
   Title=""
   BaseHREF=""
   tempstr.Initialize 
   
   
   Do Until temp >= htmlcode.Length OR temp<0
      Log(temp & "/" & htmlcode.Length)
      If mid(htmlcode, temp,1) = "<" Then
         temp2=htmlcode.IndexOf2(">", temp+1)
         htag=mid(htmlcode, temp,temp2-temp+1)
         temp=temp2+1
         
         Log("HTML: " & htag)
         Log("TAG: " & GetTagName(htag))
         
         Name=GetTagName(htag)
         Select Case Name.ToLowerCase 
            Case "a", "script", "title", "h1", "h2", "h3", "header","footer","style"
               'If Not( name.EqualsIgnoreCase("a") AND htag.Contains(" name=") ) Then
                  temp3 = htmlcode.IndexOf2("</" & name, temp2)
                  Node=mid(htmlcode, temp2+1,temp3-temp2-1).Replace(""", "'").Trim 
                  temp2=htmlcode.IndexOf2(">", temp3+1)
                  temp=temp2+1
                  Log("NODE2:" & node)
               'End If
         End Select
         
         Select Case Name.ToLowerCase.Replace("/", "")
            Case "base": BaseHREF=htag
            Case "title":    Title= node
            Case "img"
               'removed broken images
               tempstr.Append(CRLF & htag.Replace("<img", "<img onerror=" & Chr(34) & "this.style.display='none'" & Chr(34) ) )
            Case "h1", "h2", "h3", "h4"
               'tempstr= tempstr & htag & node & "</" & name & ">"
               tempstr.Append(CRLF & htag & node & "</" & name & ">")
            Case "a"':      tempstr= tempstr & MakeLCARbutton(lcar.LCAR_Orange, node)
               If node.Length>0 Then
                  If Not (node.ToLowerCase.Contains("img")) Then node=node.ToUpperCase 
                  node=MakeLCARbutton(lcar.LCAR_Orange, htag & node & "</A>")
'THIS IS THE PART THAT HANDLES URL, CHANGE THIS PART HERE TO SUIT YOUR NEEDS
               End If
               node=node.Replace(" href=" & Chr(34) & "#", " href=" & Chr(34) & ScrollTo(0) & "#")
               'tempstr= tempstr & node
               tempstr.Append(CRLF & node)
            Case "meta", "link", "!--", "script", "style", "body", "div", "span", "nav" , "input", "form", "ul", "li", "header","section","footer"  'ignore these tags
            Case Else
               'tempstr = tempstr & htag 
               tempstr.Append(CRLF & htag)
         End Select
         
      Else
         temp2=htmlcode.IndexOf2("<", temp+1)
         If temp2>-1 Then
            htag=mid(htmlcode, temp,temp2-temp).Trim
         Else
            temp2=htmlcode.Length 
            htag=right(htmlcode, temp2-temp).Trim 
         End If
         temp=temp2
         Select Case htag
            Case CRLF, "" 
            Case Else 
               'Log("NODE: " & htag)
               If CountAlphaNumericCharacters(htag) >0 Then 'tempstr=tempstr & CRLF & htag.Replace("•", "-")
                  tempstr.Append(CRLF & htag.Replace("•", "-"))
               End If
         End Select
      End If
      
   Loop
   
   If BaseHREF.Length=0 Then
      BaseHREF=left(url, url.LastIndexOf("/")+1)
      node="<BASE HREF='" & basehref & "'>"
   Else
      Msgbox("EMERGENCY","EMERGENCY")
   End If
   Return node & tempstr.ToString 
   'htmlcode.IndexOf2(
End Sub
Sub GetTagName(content As String) As String
    Dim temp As Long, temp2 As Long
    temp = InStr(content, " ",0)
    temp2 = InStr(content, ">",0)
    If temp > 0 AND temp < temp2 Then temp2 = temp
    Return Mid(content, 1, temp2 - 1)
End Sub
Sub CountAlphaNumericCharacters(Text As String)As Int
   Dim temp As Int, Count As Int,Character As Int 
   For temp = 0 To text.Length-1
      Character=Asc(mid(text,temp,1).ToLowerCase )
      If ( character >= Asc("a") AND character <= Asc("z") ) OR ( character >= Asc("0") AND character <= Asc("9")) Then count=count+1
   Next
   Return count
End Sub
Sub Instr(Text As String, TextToFind As String, Start As Int) As Int
   Return text.IndexOf2(texttofind,start)
End Sub
Sub Left(Text As String, Length As Long)As String 
   If length>text.Length Then length=text.Length 
   Return text.SubString2(0, length)
End Sub
Sub Right(Text As String, Length As Long) As String
   If length>text.Length Then length=text.Length 
   Return text.SubString(text.Length-length)
End Sub
Sub Mid(Text As String, Start As Int, Length As Int) As String 
   If Length>0 AND start>-1 AND start< text.Length Then Return text.SubString2(start,start+length)
End Sub
	I may have left out a sub here or there, so don't hesitate to tell me. I have subscribed to this thread to get emails
			
				Last edited: