Free Knowledge

Roycefer · Dec 30, 2015

Expand your mind: http://lifehacker.com/download-thousands-of-free-technical-and-research-books-1750118225

In the above-linked article, there's a link to a GitHub page that lists a lot of the math and physics books available. I leave it as an exercise for the reader to write a B4J app that scrapes that GitHub page and automatically downloads all the books linked to on that page.

DonManfred · Dec 30, 2015

This gives all

1. It downloads all PDFs
2. It shows a list of all PDFs... If one is clicked, it will be downloaded. This was made first...
I left it inside the code...

Here you go (B4J Source). The B4J-Project is attached

B4X:

' Process-wide state shared by all subs of this module.
Sub Process_Globals
    Private fx As JFX            ' JavaFX helper; used by listviewAddOneLine to obtain the default font
    Private MainForm As Form     ' main window, assigned from Form1 in AppStart
    Private Books As ListView    ' list of book titles; presumably bound by the "Layout1" layout file - confirm
    Private queue As List        ' URLs still waiting to be downloaded; index 0 is the one currently in flight
End Sub

' Application entry point.
' Loads the layout, makes sure the "FreeBooks" target folder exists, parses the
' bundled markdown book list into the download queue and the Books list view,
' and kicks off the first queued download (JobDone continues the chain).
'
' Form1 - the main form supplied by the B4J runtime.
' Args  - command line arguments (unused).
Sub AppStart (Form1 As Form, Args() As String)
    queue.Initialize
    MainForm = Form1
    MainForm.SetFormStyle("UNIFIED")
    MainForm.RootPane.LoadLayout("Layout1") 'Load the layout file.

    ' File.Combine picks the right separator for the current OS instead of a hard-coded "\".
    Log("Path:" & File.Combine(DirUserDefault, "FreeBooks"))
    File.MakeDir(DirUserDefault, "FreeBooks")

    Dim booklist As List = File.ReadList(File.DirAssets, "springer-free-maths-books.md")
    Log(booklist.Size)
    For i = 0 To booklist.Size - 1
        Dim line As String = booklist.Get(i)
        ' Expected line shape (markdown link):
        ' [A Classical Introduction to Modern Number Theory, Kenneth Ireland Michael Rosen](http://link.springer.com/content/pdf/10.1007/978-1-4757-1779-2.pdf)
        ' Anchor on "](" rather than on the first "(" so that titles which
        ' themselves contain parentheses or brackets are still split correctly.
        Dim titleStart As Int = line.IndexOf("[")
        Dim separator As Int = line.IndexOf2("](", titleStart + 1)
        If titleStart > -1 And separator > titleStart Then
            Dim urlStart As Int = separator + 2
            Dim urlEnd As Int = line.IndexOf2(")", urlStart)
            If urlEnd > urlStart Then
                Dim url As String = line.SubString2(urlStart, urlEnd)
                If url.StartsWith("http://") Or url.StartsWith("https://") Then
                    Log(line)
                    Dim title As String = line.SubString2(titleStart + 1, separator)
                    queue.Add(url)
                    listviewAddOneLine(Books, title, url)
                End If
            End If
        End If
    Next
    MainForm.Show

    ' Start the sequential download chain with the first queued URL.
    If queue.Size > 0 Then
        Dim j As HttpJob
        j.Initialize("GetBooks", Me)
        j.Tag = queue.Get(0)
        j.Download(queue.Get(0))
        Log("Downloading " & queue.Get(0))
    End If

End Sub

' Event handler: the selection in the Books list view changed.
' Starts a download for every label inside the selected row, using the URL
' stored in the label's Tag.
'
' Index - index of the newly selected item; ignored when it does not address
'         an existing row (e.g. -1 when the selection is cleared).
Sub Books_SelectedIndexChanged(Index As Int)
    ' Guard against "no selection" and stale indices, which would make Items.Get fail.
    If Index < 0 Or Index >= Books.Items.Size Then Return

    Dim ap As AnchorPane = Books.Items.Get(Index)

    For Each n As Node In ap.GetAllViewsRecursive
        If n Is Label Then
            Dim lbl As Label = n
            ' Only labels that carry a URL in their Tag can be downloaded.
            If lbl.Tag <> Null Then
                Dim j As HttpJob
                j.Initialize("GetBooks", Me)
                j.Tag = lbl.Tag
                j.Download(lbl.Tag)
                Log(lbl.Tag)
            End If
        End If
    Next

End Sub

' Callback invoked when an HttpJob finishes.
' On success the downloaded data is written to the "FreeBooks" folder below
' DirUserDefault, named after the last path segment of the URL in job.Tag.
' If the finished job was the head of the download queue, the head is removed
' and the next queued URL is started - also after a failure, so one broken
' link no longer stops the whole run.
'
' job - the finished job; job.Tag is expected to hold the download URL.
Sub JobDone(job As HttpJob)
    Dim jobUrl As String = job.Tag

    ' Only the job that actually belongs to the queue head may advance the
    ' queue. Jobs started by clicking a list item share the "GetBooks" name,
    ' so the URL is compared as well; otherwise a click would drop the entry
    ' that is still in flight and start a second, parallel download chain.
    Dim wasQueueHead As Boolean = False
    If job.JobName = "GetBooks" And queue.Size > 0 Then
        Dim headUrl As String = queue.Get(0)
        If headUrl = jobUrl Then
            queue.RemoveAt(0)
            wasQueueHead = True
        End If
    End If

    If job.Success Then
        If job.JobName = "GetBook" Then
            SaveDownloadedBook(job, "DownloadReady: ")
        Else If job.JobName = "GetBooks" Then
            SaveDownloadedBook(job, "QueueDownloadReady: ")
        End If
    Else
        Log("Error: " & job.ErrorMessage)
    End If
    job.Release

    If wasQueueHead Then StartNextQueuedDownload
End Sub

' Writes the response body of a successful job into the FreeBooks folder.
Private Sub SaveDownloadedBook(job As HttpJob, logPrefix As String)
    Log(logPrefix & job.Tag)
    ' File.Combine uses the platform's separator instead of a hard-coded "\".
    Dim targetDir As String = File.Combine(DirUserDefault, "FreeBooks")
    Dim targetName As String = GetFilename(job.Tag)
    Dim OutStream As OutputStream = File.OpenOutput(targetDir, targetName, False)
    File.Copy2(job.GetInputStream, OutStream) ' save the file
    OutStream.Close
    Log(job.Tag & " written to " & File.Combine(targetDir, targetName))
End Sub

' Starts downloading the URL at the head of the queue, if there is one.
Private Sub StartNextQueuedDownload
    If queue.Size = 0 Then Return
    Dim j As HttpJob
    j.Initialize("GetBooks", Me)
    j.Tag = queue.Get(0)
    j.Download(queue.Get(0))
    Log("Downloading " & queue.Get(0))
End Sub
' Returns everything after the last "/" of fullpath.
' When fullpath contains no "/", the whole string is returned.
Sub GetFilename(fullpath As String) As String
    Dim lastSlash As Int = fullpath.LastIndexOf("/")
    Dim nameStart As Int = lastSlash + 1
    Return fullpath.SubString(nameStart)
End Sub

' Returns the absolute path of the directory reported by
' javax.swing.filechooser.FileSystemView.getFileSystemView().getDefaultDirectory().
Sub DirUserDefault As String
    Dim viewClass As JavaObject
    viewClass.InitializeStatic("javax.swing.filechooser.FileSystemView")
    Dim fileSystemView As JavaObject = viewClass.RunMethodJO("getFileSystemView", Null)
    Dim defaultDir As JavaObject = fileSystemView.RunMethodJO("getDefaultDirectory", Null)
    Return defaultDir.RunMethod("getAbsolutePath", Null)
End Sub
' Appends a single-line row to a list view.
'
' lv    - list view that receives the new row.
' Line1 - text displayed in the row.
' Value - arbitrary payload stored in the row label's Tag.
Sub listviewAddOneLine(lv As ListView, Line1 As String, Value As Object)
    ' Build the label first ...
    Dim caption As Label
    caption.Initialize("")
    caption.Tag = Value
    caption.Font = fx.DefaultFont(16)
    caption.Text = Line1

    ' ... then wrap it in a pane as wide as the list view and add that pane as the item.
    Dim row As AnchorPane
    row.Initialize("")
    row.AddNode(caption, 0, 0, lv.Width, 20dip)
    lv.Items.Add(row)
End Sub

DonManfred · Dec 30, 2015

Roycefer said:
I leave it as an exercise for the reader to write a B4J app that scrapes that GitHub page and automatically downloads all the books linked to on that page.

See my last answer

But I decided not to scrape any pages. Instead, I took the gist and used its content in my project.

DonManfred · Dec 30, 2015

btw: my copy is still running... Already downloaded 2,7GB of PDFs

Roycefer · Dec 30, 2015

This is the solution I used:

B4X:

' Process-wide state shared by all subs of this module.
Sub Process_Globals
    Private fx As JFX
    Private MainForm As Form     ' assigned in AppStart
    Dim fileQueue As LinkedList  ' pending fileEntry items; the first element is the download in progress. NOTE(review): LinkedList is not declared in this file - presumably a class or library type; confirm
    ' One book to download: source URL, file name taken from the URL, and link text used as title.
    Type fileEntry(link As String, fileName As String, title As String)
    Dim hj As HttpJob            ' single job object reused for the index page and for every file download
    Dim gitHubLink As String = "https://gist.github.com/bishboria/8326b17bbd652f34566a" ' page that is parsed for .pdf links
    Dim htmlString As String     ' body of the downloaded index page, filled in JobDone
    Dim count As Int = 0         ' incremented in JobDone after each successfully completed file job
    Dim LS As String = GetSystemProperty("line.separator", CRLF) ' line terminator for the text files written here; falls back to the CRLF constant
End Sub

' Application entry point.
' Ensures the "books" output folder exists under File.DirApp and requests the
' index page; everything else happens in JobDone. The form is styled but
' never shown (the layout and Show calls are disabled below).
Sub AppStart (Form1 As Form, Args() As String)
    ' Output folder for the downloaded books.
    File.MakeDir(File.DirApp, "books")

    MainForm = Form1
    MainForm.SetFormStyle("UNIFIED")
    'MainForm.RootPane.LoadLayout("Layout1") 'Load the layout file.
    'MainForm.Show

    ' First request: fetch the page that lists the books.
    hj.Initialize("hj", Me)
    hj.Tag = "htmlStringDL"
    hj.Download(gitHubLink)

    ' Alternative start-up path: retry the entries recorded in ProblemList.txt.
'    ParseAndFillFileQueueFromProblemList
'    hj.Tag = "files"
'    Dim tfe As fileEntry = fileQueue.Get(0)
'    hj.Download(tfe.link)
End Sub

' Callback invoked when the shared HttpJob finishes.
' - Index job (Tag "htmlStringDL"): stores the page, fills fileQueue from it.
'   If the index download fails there is nothing to process, so the sub logs
'   the error and returns without touching the (still uninitialized) queue.
' - File job: saves the response under File.DirApp/books as
'   "<sanitized title>_<fileName>". Entries that could not be downloaded or
'   saved are appended to ProblemList.txt so they can be retried later.
' Afterwards the head of the queue is removed and the next entry is started.
'
' Job - the finished job (the same object as the global hj).
Sub JobDone(Job As HttpJob)
    Dim jobKind As String = Job.Tag

    If jobKind = "htmlStringDL" Then
        If Job.Success = False Then
            Log("Failed to download the book index: " & Job.ErrorMessage)
            Job.Release
            Return
        End If
        htmlString = Job.GetString
        Log("Parsing github link")
        ParseHTMLAndFillFileQueue
    Else
        Dim thisFE As fileEntry = fileQueue.Get(0)
        If Job.Success Then
            Try
                Dim newFileName As String = SanitizeTitleForFileName(thisFE.title) & "_" & thisFE.fileName
                Dim inpstr As InputStream = Job.GetInputStream
                Dim outstr As OutputStream = File.OpenOutput(File.Combine(File.DirApp, "books"), newFileName, False)
                File.Copy2(inpstr, outstr)
                outstr.Flush
                outstr.Close
            Catch
                Log(LastException.Message)
                RecordProblemEntry(thisFE)
            End Try
            count = count + 1
        Else
            ' A failed download is just as "problematic" as a failed save:
            ' remember it instead of silently dropping it.
            Log("Failed Download: " & Job.ErrorMessage)
            RecordProblemEntry(thisFE)
        End If
        fileQueue.RemoveFirst
        Log("count: " & count & " fileQueue.Size: " & fileQueue.Size)
    End If

    ' Release the finished response before the job object is reused.
    Job.Release
    hj.Tag = "files"
    If fileQueue.Size > 0 Then
        Dim nextFile As fileEntry = fileQueue.Get(0)
        hj.Download(nextFile.link)
    Else
        Log("Done")
    End If
End Sub

' Turns a book title into a string that is safe to use in a file name.
Private Sub SanitizeTitleForFileName(title As String) As String
    Dim cleaned As String = title.Replace("""", "").Replace("?", "").Replace("*", "_star").Replace("+", "_plus") _
        .Replace(" ", "_").Replace(",", "").Replace(":", "")
    ' Path separators and other characters rejected by common file systems.
    cleaned = cleaned.Replace("/", "_").Replace("\", "_").Replace("<", "").Replace(">", "").Replace("|", "")
    Return cleaned
End Sub

' Appends one entry to ProblemList.txt in the "link;fileName;title" format
' that ParseAndFillFileQueueFromProblemList reads back.
Private Sub RecordProblemEntry(fe As fileEntry)
    Dim tw As TextWriter
    tw.Initialize(File.OpenOutput(File.DirApp, "ProblemList.txt", True))
    tw.Write(fe.link & ";" & fe.fileName & ";" & fe.title & LS)
    tw.Flush
    tw.Close
End Sub

' Scans htmlString for anchors whose href ends in ".pdf" and fills fileQueue
' with one fileEntry per anchor (link, file name from the URL, link text as
' title). Every entry is also appended to FEList.txt as "link;fileName;title".
'
' The href value is read up to its closing quote and the title starts after
' the end of the opening tag, so anchors that carry further attributes after
' href are recognised as well. Scanning stops quietly at a truncated anchor
' instead of failing on a missing delimiter.
Sub ParseHTMLAndFillFileQueue
    fileQueue.Initialize
    fileQueue.Clear
    Dim ahrefOpen As String = "<a href="""
    Dim aclose As String = "</a>"
    Dim tw As TextWriter
    tw.Initialize(File.OpenOutput(File.DirApp, "FEList.txt", True))
    Dim index As Int = htmlString.IndexOf2(ahrefOpen, 0)
    Do While index > -1
        index = index + ahrefOpen.Length
        Dim linkEnd As Int = htmlString.IndexOf2("""", index)
        If linkEnd = -1 Then Exit ' unterminated href: nothing usable follows
        Dim link As String = htmlString.SubString2(index, linkEnd)
        If link.EndsWith(".pdf") Then
            Dim tagEnd As Int = htmlString.IndexOf2(">", linkEnd)
            If tagEnd = -1 Then Exit
            Dim titleStart As Int = tagEnd + 1
            Dim titleEnd As Int = htmlString.IndexOf2(aclose, titleStart)
            If titleEnd = -1 Then Exit
            Dim thisFE As fileEntry
            thisFE.Initialize
            thisFE.link = link
            thisFE.fileName = link.SubString(link.LastIndexOf("/") + 1)
            thisFE.title = htmlString.SubString2(titleStart, titleEnd)
            fileQueue.Add(thisFE)
            tw.Write(thisFE.link & ";" & thisFE.fileName & ";" & thisFE.title & LS)
            tw.Flush
            index = titleEnd
        End If
        index = htmlString.IndexOf2(ahrefOpen, index)
    Loop
    tw.Close
    Log("Done filling fileQueue, size: " & fileQueue.Size)
End Sub

' Rebuilds fileQueue from ProblemList.txt (one "link;fileName;title" record
' per line) so that previously failed entries can be downloaded again.
' A missing file yields an empty queue; lines with fewer than three fields
' (e.g. blank lines) are skipped. Because link and fileName come first, any
' additional ";" is treated as part of the title.
Sub ParseAndFillFileQueueFromProblemList
    fileQueue.Initialize
    fileQueue.Clear
    If File.Exists(File.DirApp, "ProblemList.txt") = False Then
        Log("Done filling fileQueue, size: " & fileQueue.Size)
        Return
    End If
    Dim strList As List = File.ReadList(File.DirApp, "ProblemList.txt")
    For Each str As String In strList
        ' Split once per line instead of once per field.
        Dim parts() As String = Regex.Split(";", str)
        If parts.Length >= 3 Then
            Dim thisFE As fileEntry
            thisFE.Initialize
            thisFE.link = parts(0)
            thisFE.fileName = parts(1)
            ' Re-join the remainder so titles containing ";" survive the round trip.
            Dim fullTitle As String = parts(2)
            For p = 3 To parts.Length - 1
                fullTitle = fullTitle & ";" & parts(p)
            Next
            thisFE.title = fullTitle
            fileQueue.Add(thisFE)
        End If
    Next
    Log("Done filling fileQueue, size: " & fileQueue.Size)
End Sub

It renames the files to have the title/author in the file name, and it saves problematic links to a text file so you can go back later and redownload them. In all, it ended up downloading some 11+ GB. That's a lot of knowledge.

Free Knowledge

Roycefer

Well-Known Member

DonManfred

Expert

Attachments

DonManfred

Expert

DonManfred

Expert

Roycefer

Well-Known Member