Android Tutorial Parse HTML code

Here is my module to parse HTML code, tag by tag.
It's specific to my needs, but you can alter it to suit specific tags. If you need help, don't hesitate to ask.
As a warning, I probably didn't put enough error handling in it, as I made it to parse Wikipedia, which uses well-formed HTML.

B4X:
Sub Process_Globals
   'These global variables will be declared once when the application starts.
   'These variables can be accessed from all modules.

   'Not touched by the code visible in this module - presumably the current page URL; verify against callers.
   Dim dURL As String
   'Cleared at the start of ParseHTML, then set to the contents of the page's <title> element.
   Dim Title As String
   'Not touched by the code visible in this module.
   Dim FileLoaded As String
   'Cleared at the start of ParseHTML; holds the page's <base> tag, or a base derived from the URL.
   Dim BaseHref As String

   'Key/value pair; not used by the subs visible here (presumably one tag attribute - TODO confirm).
   Type HTMLvalue(Key As String, Value As String)
   'Record describing one tag; not used by the subs visible here.
   Type HTMLtag(Level As Int, TagName As String, Node As String, Values As List)

End Sub



'Walks HTMLCode tag by tag and rebuilds a simplified version of the page.
'
'HTMLCode: the raw HTML text to process.
'URL:      the address the page came from; used to derive a base href when
'          the page does not contain its own <base> tag.
'Returns:  a <base> tag followed by the rewritten markup (one item per line).
'
'Side effects: resets and then fills the globals Title (content of <title>)
'and BaseHREF (the page's <base> tag, or a base derived from URL).
'
'Handling per tag:
' - a, script, title, h1-h4, header, footer, style: everything up to the
'   matching closing tag is captured into Node and skipped in the input.
' - img: re-emitted with an onerror handler that hides broken images.
' - h1-h4: re-emitted with their captured content.
' - a: wrapped by MakeLCARbutton (defined elsewhere) and "#" anchors are
'   prefixed with ScrollTo(0) (defined elsewhere).
' - layout/meta tags listed in the "ignore" Case are dropped.
' - everything else is copied through unchanged.
'Text between tags is kept only if it contains at least one letter or digit.
Sub ParseHTML(HTMLCode As String, URL As String)As String 
   'temp  = current read position, temp2 = end of the current tag/text run,
   'temp3 = position of the matching closing tag (an index, hence Int).
   Dim temp As Int, temp2 As Int, temp3 As Int
   Dim htag As String, Name As String, Node As String
   Dim tempstr As StringBuilder
   Title=""
   BaseHREF=""
   tempstr.Initialize 
   
   Do Until temp >= HTMLCode.Length OR temp<0
      Log(temp & "/" & HTMLCode.Length)
      If Mid(HTMLCode, temp,1) = "<" Then
         temp2=HTMLCode.IndexOf2(">", temp+1)
         'Unterminated tag at the end of the input: stop here. Without this
         'check temp would be reset to 0 and the loop would never finish.
         If temp2 < 0 Then Exit
         htag=Mid(HTMLCode, temp,temp2-temp+1)
         temp=temp2+1
         
         Name=GetTagName(htag)
         'Start every tag with an empty node so content captured for an
         'earlier tag can never leak into this one.
         Node=""
         Log("HTML: " & htag)
         Log("TAG: " & Name)
         
         Select Case Name.ToLowerCase 
            Case "a", "script", "title", "h1", "h2", "h3", "h4", "header","footer","style"
               temp3 = HTMLCode.IndexOf2("</" & Name, temp2)
               'Only consume the element body when a closing tag exists;
               'otherwise carry on right after the opening tag.
               If temp3 > -1 Then
                  Node=Mid(HTMLCode, temp2+1,temp3-temp2-1).Replace("&quot;", "'").Trim 
                  temp2=HTMLCode.IndexOf2(">", temp3+1)
                  If temp2 < 0 Then
                     'Closing tag is cut off: nothing left to parse.
                     temp=HTMLCode.Length
                  Else
                     temp=temp2+1
                  End If
                  Log("NODE2:" & Node)
               End If
         End Select
         
         Select Case Name.ToLowerCase.Replace("/", "")
            Case "base": BaseHREF=htag
            Case "title":    Title= Node
            Case "img"
               'removed broken images
               tempstr.Append(CRLF & htag.Replace("<img", "<img onerror=" & Chr(34) & "this.style.display='none'" & Chr(34) ) )
            Case "h1", "h2", "h3", "h4"
               tempstr.Append(CRLF & htag & Node & "</" & Name & ">")
            Case "a"
               If Node.Length>0 Then
                  'Upper-case plain text links, but leave links that contain an image alone.
                  If Not (Node.ToLowerCase.Contains("img")) Then Node=Node.ToUpperCase 
                  Node=MakeLCARbutton(lcar.LCAR_Orange, htag & Node & "</A>")
'THIS IS THE PART THAT HANDLES URL, CHANGE THIS PART HERE TO SUIT YOUR NEEDS

               End If
               Node=Node.Replace(" href=" & Chr(34) & "#", " href=" & Chr(34) & ScrollTo(0) & "#")
               tempstr.Append(CRLF & Node)
            Case "meta", "link", "!--", "script", "style", "body", "div", "span", "nav" , "input", "form", "ul", "li", "header","section","footer"  'ignore these tags
            Case Else
               tempstr.Append(CRLF & htag)
         End Select
         
      Else
         'Plain text: runs up to the next "<" or to the end of the input.
         temp2=HTMLCode.IndexOf2("<", temp+1)
         If temp2>-1 Then
            htag=Mid(HTMLCode, temp,temp2-temp).Trim
         Else
            temp2=HTMLCode.Length 
            htag=Right(HTMLCode, temp2-temp).Trim 
         End If
         temp=temp2
         Select Case htag
            Case CRLF, "" 
            Case Else 
               If CountAlphaNumericCharacters(htag) >0 Then
                  tempstr.Append(CRLF & htag.Replace("•", "-"))
               End If
         End Select
      End If
      
   Loop
   
   If BaseHREF.Length=0 Then
      'No <base> tag in the page: derive one from the URL's directory.
      BaseHREF=Left(URL, URL.LastIndexOf("/")+1)
      Node="<BASE HREF='" & BaseHREF & "'>"
   Else
      'The page supplied its own <base> tag: re-emit it as-is. (This branch
      'used to show a debug message box and prepend whatever Node was left
      'over from the last parsed tag.)
      Node=BaseHREF
   End If
   Return Node & tempstr.ToString 
End Sub


'Extracts the tag name from a single tag such as "<a href=...>" or "</div>".
'
'content: one complete tag, starting with "<" and containing ">".
'Returns: the characters after the leading "<" up to the first whitespace
'         (space, tab, CR or LF) or ">", whichever comes first. A closing
'         tag keeps its slash ("/div"). Returns "" when content has no ">"
'         or when the name would be empty.
Sub GetTagName(content As String) As String
    Dim closeAt As Int, endAt As Int, i As Int, code As Int
    closeAt = content.IndexOf(">")
    If closeAt < 0 Then Return ""
    endAt = closeAt
    'Index 0 is the "<" itself, so the scan starts at 1.
    For i = 1 To closeAt - 1
        code = Asc(content.CharAt(i))
        'HTML allows tab / line breaks as well as spaces after the tag name.
        If code = 32 OR code = 9 OR code = 10 OR code = 13 Then
            endAt = i
            Exit
        End If
    Next
    If endAt <= 1 Then Return ""
    Return content.SubString2(1, endAt)
End Sub

'Counts how many characters of Text are ASCII letters (a-z, case-insensitive)
'or ASCII digits (0-9).
'
'Text:    the string to inspect.
'Returns: the number of such characters; 0 for an empty string.
Sub CountAlphaNumericCharacters(Text As String)As Int
   Dim total As Int, pos As Int, code As Int
   Dim isLetter As Boolean, isDigit As Boolean
   pos = 0
   Do While pos < Text.Length
      'Lower-case the single character first so one range check covers both cases.
      code = Asc(Text.SubString2(pos, pos + 1).ToLowerCase)
      isLetter = code >= Asc("a") AND code <= Asc("z")
      isDigit = code >= Asc("0") AND code <= Asc("9")
      If isLetter OR isDigit Then
         total = total + 1
      End If
      pos = pos + 1
   Loop
   Return total
End Sub




'VB-style search helper: a thin wrapper around String.IndexOf2.
'
'Text:       the string to search in.
'TextToFind: the substring to look for.
'Start:      zero-based index at which the search begins.
'Returns:    whatever Text.IndexOf2 returns for these arguments.
Sub Instr(Text As String, TextToFind As String, Start As Int) As Int
   Dim foundAt As Int
   foundAt = Text.IndexOf2(TextToFind, Start)
   Return foundAt
End Sub
'VB-style Left: returns the first Length characters of Text.
'
'Text:    the source string.
'Length:  number of characters wanted. Values larger than the string are
'         clamped; zero or negative values give "" (a negative value used
'         to reach SubString2 and raise an index error).
'Returns: the leading part of Text, or "".
Sub Left(Text As String, Length As Long)As String 
   If Length <= 0 Then Return ""
   Return Text.SubString2(0, Min(Text.Length, Length))
End Sub
'VB-style Right: returns the last Length characters of Text.
'
'Text:    the source string.
'Length:  number of characters wanted. Values larger than the string are
'         clamped; zero or negative values give "" (a negative value used
'         to reach SubString and raise an index error).
'Returns: the trailing part of Text, or "".
Sub Right(Text As String, Length As Long) As String
   If Length <= 0 Then Return ""
   Return Text.SubString(Text.Length - Min(Text.Length, Length))
End Sub
'VB-style Mid with a zero-based Start: returns up to Length characters of
'Text beginning at Start.
'
'Text:    the source string.
'Start:   zero-based index of the first character to return.
'Length:  maximum number of characters to return.
'Returns: the requested slice, shortened if it would run past the end of
'         Text (this used to raise an index error); "" when Length is not
'         positive or Start lies outside Text.
Sub Mid(Text As String, Start As Int, Length As Int) As String 
   If Length <= 0 OR Start < 0 OR Start >= Text.Length Then Return ""
   Return Text.SubString2(Start, Min(Text.Length, Start + Length))
End Sub

I may have left out a sub here or there, so don't hesitate to tell me. I have subscribed to this thread, so I will get email notifications of replies.
 
Last edited:

netchicken

Active Member
Licensed User
Longtime User
Wow, I have been thinking over the last few days how to get started on a program that extracts information from a webpage. I was totally at a loss for a beginning point, now you post this!!

Thanks so much; I look forward to working with it over the next week or so.

Gary
 

giammy

New Member
Licensed User
Longtime User
problem with parse html

Hi, I tried to use the HttpUtils example, but after the HTML page has been parsed and the string obtained, the string is never updated: it keeps the value from the first parse. How can I refresh it?


code:

Sub Globals
   'Address of the page to download; also passed to HttpUtils in JobDone.
   Dim b4a As String = "http://www.b4x.com"
End Sub

'Configures the HttpUtils callback target and starts downloading the page at b4a.
'FirstTime is not used here.
Sub Activity_Create (FirstTime As Boolean)
'Tell HttpUtils which activity and sub to call back (presumably when the job finishes - see the HttpUtils module).
HttpUtils.CallbackActivity = "Main" 'Current activity name.
HttpUtils.CallbackJobDoneSub = "JobDone"
'Start the download of the URL held in b4a under the job name "Job1".
HttpUtils.Download("Job1", b4a)
End Sub

'Callback named in HttpUtils.CallbackJobDoneSub. Reads the downloaded text
'for the b4a URL when HttpUtils reports success for it.
'Job is not inspected here.
Sub JobDone (Job As String)
   Dim s As String
   If HttpUtils.IsSuccess(b4a) = False Then Return
   'NOTE(review): s is a local that is never used after this assignment,
   'so the downloaded HTML is discarded when the sub ends.
   s = HttpUtils.GetString(b4a)
End Sub
 

NeoTechni

Well-Known Member
Licensed User
Longtime User
After "s = HttpUtils.GetString(b4a)" you didn't actually do anything with the HTML.
 

peacemaker

Expert
Licensed User
Longtime User
How to clean the HTML ? I mean extract just the plain text from it ?

So, it needs to delete all the tags, saving the text only.
Please, suggest modifications.
 

NeoTechni

Well-Known Member
Licensed User
Longtime User
Just remove the 2 select case structures for handling tags

Select Case Name.ToLowerCase

and

Select Case Name.ToLowerCase.Replace("/", "")
 

walterf25

Expert
Licensed User
Longtime User
HTML Parsing

Hi NeoTechni, I actually need some help parsing an HTML file. I'm working on an app that lets you download movies, music, ebooks, etc. straight to your phone, but I'm having some problems parsing the content.

the url is this http://thepiratebay.se/top/all

Can you maybe help out with this? I saw your module, but I can't seem to follow it well enough to modify it for my specific needs.

Thanks, and please let me know if you can help me out with this!

cheers,
Walter
 

NeoTechni

Well-Known Member
Licensed User
Longtime User
Sure I can help. You're just trying to extract a list of URLs?

B4X:
'Returns the value of one double-quoted attribute inside an HTML fragment.
'
'HTML:    the text to search, e.g. a single tag such as <a href="...">.
'Tag:     the attribute name to look for (matched as  name=" with a
'         leading space, exactly as written).
'Returns: whatever GetBetween finds between  Tag="  and the next double quote.
'
'Uses the built-in QUOTE constant instead of GPlus.vbQuote, which lives in
'a separate module that is not part of this code.
Sub GetTag(HTML As String, Tag As String) As String 
   Return GetBetween(HTML, " " & Tag & "=" & QUOTE, QUOTE)
End Sub

'Collects every opening <a ...> tag found in HTMLCode.
'
'HTMLCode: the raw HTML text to scan.
'Returns:  a List of strings, one per opening anchor tag, each holding the
'          complete tag text (from "<" to ">"). Use GetTag(item, "href")
'          to pull the address out of an entry.
'
'The content of each link, up to and including its closing tag, is skipped,
'so tags nested inside a link are not examined. Stray closing </a> tags are
'not added to the list.
Sub EnumAHREFs(HTMLCode As String)As List 
   'temp  = current read position, temp2 = end of the current tag,
   'temp3 = position of the matching closing tag (an index, hence Int).
   Dim temp As Int, temp2 As Int, temp3 As Int
   Dim htag As String, Name As String
   Dim HREFS As List
   HREFS.Initialize 
   Do Until temp >= HTMLCode.Length OR temp<0
      If Mid(HTMLCode, temp,1) = "<" Then
         temp2=HTMLCode.IndexOf2(">", temp+1)
         'Unterminated tag at the end of the input: stop here. Without this
         'check temp would be reset to 0 and the loop would never finish.
         If temp2 < 0 Then Exit
         htag=Mid(HTMLCode, temp,temp2-temp+1)
         temp=temp2+1
         Name=GetTagName(htag)
         If Name.ToLowerCase = "a" Then
            HREFS.Add( htag )
            'Jump past the link's content and closing tag when there is one;
            'otherwise carry on right after the opening tag.
            temp3 = HTMLCode.IndexOf2("</" & Name, temp2)
            If temp3 > -1 Then
               temp2=HTMLCode.IndexOf2(">", temp3+1)
               If temp2 < 0 Then Exit
               temp=temp2+1
            End If
         End If
      Else
         'Plain text: jump straight to the next tag, or finish when there is none.
         temp=HTMLCode.IndexOf2("<", temp+1)
         If temp < 0 Then Exit
      End If
   Loop

   Return HREFS
End Sub
 

warwound

Expert
Licensed User
Longtime User
Hi Neotechmi, i actually need some help parsing an HTML file, i'm working on this app where you can download movies, music, ebooks etc.. straight into your phone, but i'm having some problems parsing the content.

the url is this The Pirate Bay - The galaxy's most resilient bittorrent site

can you maybe help out with this, i saw your module but i can't seem to follow it to modify it to my specific needs.

Thanks, and please let me know if you can help me out with this!

cheers,
Walter

Hi walterf25.

Did you follow up on my post to your thread here: http://www.b4x.com/forum/basic4andr.../25274-parsing-html-page-help.html#post146819?

I reckon a server-side proxy script would be far better and cope with badly written HTML - i can help with the PHP if required.

Martin.
 

NeoTechni

Well-Known Member
Licensed User
Longtime User
Turns out I forgot to post the GetBetween sub... but that would do most of it for you. I'll post it once I get home.

You just need to find the start and end of the table
I replaced the HTML start/end brackets with { } so they'd show

Start: {table style="width:100%; font-size:11px"}
End: {/table}

Getbetween would get everything between those, which is the data you'd need
Then replace the tab character with nothing, to get rid of garbage data

Then you can regex.split the text on {/tr} which would separate it by row into an array which you can loop through, again using GetBetween on
{strong} and {/strong}
and {span} and {/span}

anytime there's nothing between {span} and {/span} you'd treat it as a label rather than a value
 

NeoTechni

Well-Known Member
Licensed User
Longtime User
B4X:
'Returns the text found between two delimiters.
'
'Text:    the string to search.
'Start:   the opening delimiter; its first occurrence in Text is used.
'Finish:  the closing delimiter; searched for immediately after Start.
'Returns: the characters between the two delimiters. "" when Start or
'         Finish is not found, or when nothing lies between them.
'
'The search for Finish begins directly after Start. It previously began
'one character later, so an empty value (e.g. href="") skipped its own
'closing delimiter and returned text up to a later one.
Sub GetBetween(Text As String, Start As String, Finish As String) As String 
   Dim startAt As Int, valueAt As Int, finishAt As Int
   startAt = Text.IndexOf(Start)
   If startAt < 0 Then Return ""
   valueAt = startAt + Start.Length
   finishAt = Text.IndexOf2(Finish, valueAt)
   If finishAt < valueAt Then Return ""
   Return Text.SubString2(valueAt, finishAt)
End Sub

'VB-style Left: returns the first Length characters of Text.
'
'Text:    the source string.
'Length:  number of characters wanted; clamped to the length of Text.
'Returns: the leading part of Text, or "" when Text is empty or Length is
'         not positive.
Sub Left(Text As String, Length As Long)As String 
   If Text.Length = 0 OR Length <= 0 Then Return ""
   Dim take As Int
   take = Min(Text.Length, Length)
   Return Text.SubString2(0, take)
End Sub

'VB-style Right: returns the last Length characters of Text.
'
'Text:    the source string.
'Length:  number of characters wanted; clamped to the length of Text.
'Returns: the trailing part of Text, or "" when Text is empty or Length is
'         not positive.
Sub Right(Text As String, Length As Long) As String
   If Text.Length = 0 OR Length <= 0 Then Return ""
   Dim take As Int
   take = Min(Text.Length, Length)
   Return Text.SubString(Text.Length - take)
End Sub
'VB-style Mid with a zero-based Start: returns up to Length characters of
'Text beginning at Start.
'
'Text:    the source string.
'Start:   zero-based index of the first character to return.
'Length:  maximum number of characters to return.
'Returns: the requested slice, shortened if it would run past the end of
'         Text (this used to raise an index error); "" when Length is not
'         positive or Start lies outside Text.
Sub Mid(Text As String, Start As Int, Length As Int) As String 
   If Length <= 0 OR Start < 0 OR Start >= Text.Length Then Return ""
   Return Text.SubString2(Start, Min(Text.Length, Start + Length))
End Sub
 

walterf25

Expert
Licensed User
Longtime User
Help parsing HTML

Hi Neo, I'm back trying to update my app. I need an easy way to parse a very badly formatted HTML file. I know you posted an example for me, but I have not been able to figure out how to make it work. I'm at it again, but I'm stuck at this function:
B4X:
'Returns the value of one double-quoted attribute inside an HTML fragment.
'
'HTML:    the text to search, e.g. a single tag such as <a href="...">.
'Tag:     the attribute name to look for (matched as  name=" with a
'         leading space, exactly as written).
'Returns: whatever GetBetween finds between  Tag="  and the next double quote.
'
'Uses the built-in QUOTE constant instead of GPlus.vbQuote, which lives in
'a separate module that is not part of this code.
Sub GetTag(HTML As String, Tag As String) As String 
    Return GetBetween(HTML, " " & Tag & "=" & QUOTE, QUOTE)
End Sub

What exactly is GPlus.vbQuote? Is this another library I'm missing?

Can you point me in the right direction?

Thanks,
Walter
 

NeoTechni

Well-Known Member
Licensed User
Longtime User
Ah, my bad.

B4X:
'A single double-quote character, used when building attribute delimiters.
Dim VBquote As String = Chr(34)

I doubt my code works well with badly formatted HTML. I think I was lazy.
 

NeoTechni

Well-Known Member
Licensed User
Longtime User
I put VBquote in that library and didn't fix the reference.
It's for parsing Google Plus, Twitter and now Facebook
 
Top