HTML Table to XLS

Just a little parser. It is also shown how to delete an HTML Element: The program requires references to the following:
  • Microsoft Internet Controls
  • Microsoft Shell Controls and Automation (for Internet Explorer Medium)
  • Microsoft HTML Object Library
But it could also be done with late binding (Object declaration)
Option Explicit

'The program requires references to the following:
'1 Microsoft Internet Controls (SHDocVw)
'2. Microsoft HTML Object Library
'3. Microsoft Shell Controls And Automation

Dim oIE As Object 'SHDocVw.InternetExplorer 'SHDocVw.InternetExplorerMedium 'Object: to refer to the running copy of Internet Explorer
Dim hDoc As MSHTML.HTMLDocument 'Object: to refer to the HTML document returned
Dim strURL As String
Dim hElement As MSHTML.IHTMLElement
Dim hCollection As MSHTML.IHTMLElementCollection
Dim hNode As MSHTML.IHTMLDOMNode
Dim hDOMDoc As MSHTML.IHTMLDOMNode2
Dim oWbk As Excel.Workbook
Dim oWsh As Excel.Worksheet

Private Type tRef
Link As String
File As String
NewFile As String
End Type

Private Sub sPurge()
Dim strPath As String
Dim strFile As String
Dim strDoc As String
Dim strID As String
Dim iFile As Integer
Dim aFile() As String
Dim lgFile As Long
Dim strSrc As String
Dim lgItem As Long
Dim lghRef As Long
Dim bFound As Boolean
Dim ahRef() As tRef
Dim hNewDoc As MSHTML.HTMLDocument
Dim hNewHead As MSHTML.HTMLHeadElement
Dim hNewElement As MSHTML.IHTMLElement
Dim hChildrenCollection As MSHTML.IHTMLElementCollection 'Object
Dim lgR As Long
Dim strHTML As String
Dim strBookmark As String

Call fIE_Initialize(True)

lgFile = -1
strPath = VBA.Environ$("UserProfile") & "\Downloads\"
strFile = VBA.Dir(strPath & "*.htm")
Do
lgFile = lgFile + 1
ReDim Preserve aFile(0 To lgFile)
aFile(lgFile) = strFile 'strPath &
strFile = Dir
Loop Until strFile = vbNullString
'Stop

ahRef() = fRangeValues ' get links
lgR = 0

For lgFile = LBound(aFile) To UBound(aFile)
strFile = strPath & aFile(lgFile)

strURL = "file:///" & VBA.Replace(strFile, "\", "/")
If fIE_Load(strURL, True) Then
Call fIE_Document
'Set hDOMDoc = oIE.Document.DOMDocument
'-------------------------
Set hNewDoc = New MSHTML.HTMLDocument
Set hNewHead = hNewDoc.head
'Set hNewHead = hNewDoc.createElement("head")

'Save <title>
hNewDoc.title = hDoc.title

' Save all <meta>
Set hCollection = hDoc.getElementsByTagName("meta")
If hCollection.Length = 0 Then Stop
For Each hElement In hCollection
' Avoid Copyright
If hElement.outerHTML Like "*Copyright *" Then
ElseIf hElement.outerHTML Like "*copyright*" Then
Else
Set hNewElement = hNewDoc.createElement(hElement.outerHTML)
hNewHead.appendChild hNewElement
End If
Next hElement
'End If

Set hNewElement = hNewDoc.createElement("<meta name=""copyright"" content=""<a href=http://www.xxx.com title=xxx.com>xxx.com</a>"">")
hNewHead.appendChild hNewElement
Set hNewElement = hNewDoc.createElement("<meta name=copyright content='Copyright (c) by xxx'>")
hNewHead.appendChild hNewElement
Set hNewElement = hNewDoc.createElement("<meta name=author content='xxx'>")
hNewHead.appendChild hNewElement

' Delete all <script>
Set hCollection = hDoc.getElementsByTagName("script")
If hCollection.Length = 0 Then Stop
For Each hElement In hCollection
Call fDeleteElement(hElement)
Next hElement
'End If

Set hNewElement = hNewDoc.createElement("<script src="".\js\script.js"" type=""text/javascript""></script>")
hNewHead.appendChild hNewElement

Set hNewElement = hNewDoc.createElement("<link href="".\img\favicon.ico"" rel=""shortcut icon"" type=""image/x-icon"">")
hNewHead.appendChild hNewElement

Set hCollection = hDoc.getElementsByTagName("link")
If hCollection.Length = 0 Then Stop
For Each hElement In hCollection
' Avoid icons and Google scripts
If hElement.outerHTML Like "*googlesyndication*" Then
ElseIf hElement.outerHTML Like "*adservice.google*" Then
Else
hNewHead.appendChild hNewElement
End If
Next hElement
'End If
Set hNewElement = hNewDoc.createElement("<link href="".\css\styles.css"" rel=""StyleSheet"" type=""text/css"">")
hNewHead.appendChild hNewElement

'----------------------------
strHTML = hNewDoc.head.innerHTML
strHTML = VBA.Replace(strHTML, "<META ", "<meta ", , , vbTextCompare)
strHTML = VBA.Replace(strHTML, "><title>", ">" & vbLf & "<title>", , , vbTextCompare)
strHTML = VBA.Replace(strHTML, "><link ", ">" & vbLf & "<link ", , , vbTextCompare)
strHTML = VBA.Replace(strHTML, "head><", "head>" & vbLf & "<", , , vbTextCompare)
strHTML = VBA.Replace(strHTML, "></head", ">" & vbLf & "</head", , , vbTextCompare)
strHTML = VBA.Replace(strHTML, "><body", ">" & vbLf & "<body", , , vbTextCompare)
strHTML = VBA.Replace(strHTML, "></body", ">" & vbLf & "</body", , , vbTextCompare)
strHTML = VBA.Replace(strHTML, "><table ", ">" & vbLf & "<table ", , , vbTextCompare)

hDoc.head.innerHTML = strHTML

'------------------ 1st phase
' Get (and Set) relevant HTML --> <div id="main"
Set hElement = Nothing
'Do While hElement Is Nothing
Set hElement = hDoc.getElementById("main") 'this is the relevant content
hDoc.body.innerHTML = hElement.innerHTML
' DoEvents
'Loop

' Delete node: <div id="blocks"> ' if inside
Set hNode = hDoc.getElementById("blocks")
If Not (hNode Is Nothing) Then
hNode.ParentNode.RemoveChild hNode
End If

' Delete content: <ul class="forum">
Set hCollection = hDoc.getElementsByClassName("forum")
If hCollection.Length > 0 Then
For Each hElement In hCollection
Set hElement = hElement.parentElement
Set hElement = hElement.parentElement
hElement.innerHTML = ""
Next hElement
End If
'------------------ 2nd phase

' Get new HTML body --> <body>, set attribute: id="main"
Set hCollection = hDoc.getElementsByTagName("body")
If hCollection.Length = 0 Then Stop
For Each hElement In hCollection
hElement.setAttribute "id", "main"
Next hElement

Set hCollection = hDoc.getElementsByTagName("img")
If hCollection.Length = 0 Then Stop
For Each hElement In hCollection
strSrc = hElement.getAttribute("src")

strSrc = VBA.Replace(strSrc, "https://xxx.com/images/", ".\img\")

If strSrc <> vbNullString Then
hElement.setAttribute "src", strSrc
End If
Next hElement
'Stop
' Extract all links
Set hCollection = hDoc.getElementsByTagName("a")
For Each hElement In hCollection
'!!!!!!!!!
On Error Resume Next
strSrc = ""
strSrc = hElement.getAttribute("href")
'strSrc = VBA.Mid$(strSrc, 1, VBA.InStr(1, strSrc, "&name", vbTextCompare) - 1)

If VBA.InStr(1, strSrc, "#") > 0 Then
strBookmark = VBA.Mid$(strSrc, VBA.InStr(1, strSrc, "#"))
strSrc = VBA.Mid$(strSrc, 1, VBA.Len(strSrc) - VBA.Len(strBookmark))
End If

'Stop
On Error GoTo 0
'!!!!!!!!!
bFound = False
'If Not (Not ahRef) Then
' For lgItem = LBound(ahRef) To UBound(ahRef)
' If strSrc Like ahRef(lgItem).File & "*" Then
' bFound = True
' Exit For
' End If
' Next lgItem
'End If

' If Not bFound Then
'!!!!!!!!!
'Stop
'!!!!!!!!!
' Else
' store / download link...
'lghRef = UBound(ahRef) + 1
'lgR = lgR + 1
'Cells(lgR, 1).Value2 = strSrc
''lghRef = UBound(ahRef)
''ReDim Preserve ahRef(0 To lghRef)
'ahRef(lghRef) = strSrc

' Get between "open" and "close"
'strSrc = VBA.Mid$(strSrc, VBA.InStr(1, strSrc, "open") + VBA.Len("open"))
'strSrc = VBA.Mid$(strSrc, 1, VBA.InStr(1, strSrc, "close") - 1)
'If Error:
'Cells(lgR, 1).Value2 = "*" & aFile(lgFile)
If strSrc <> vbNullString Then
strSrc = strSrc & ".htm" & strBookmark
End If
' End If

If strSrc <> vbNullString Then
hElement.setAttribute "href", strSrc '& ".htm" & strBookmark
End If
Next hElement

'------------------ 3th phase

bFound = False
'If Not (Not ahRef) Then
'For lgItem = LBound(ahRef) To UBound(ahRef)
' If aFile(lgFile) = ahRef(lgItem).File Then
' bFound = True
' Exit For
' End If
'Next lgItem
'End If
'If bFound Then
' strFile = strPath & ahRef(lgItem).NewFile & ".htm"
' 'Name strPath & aFile(lgFile) As strFile
' 'Kill strPath & aFile(lgFile)
'Else
' Stop
'End If
iFile = VBA.FreeFile()
Open strFile For Output Shared As #iFile
Print #iFile, hDoc.DocumentElement.innerHTML
Close #iFile
End If
Next lgFile

ExitProc:
Set hElement = Nothing
Set hNode = Nothing
Set hNewDoc = Nothing
'Call fIE_Terminate
Exit Function
ErrControl:
'Handle Error
Resume ExitProc
End Function

'Dim doc As mshtml.IHTMLDocument2 = WebBrowser1.Document.DomDocument
'Dim child As mshtml.HTMLHtmlElement = doc.childNodes(0)
'Dim domNode As mshtml.IHTMLDOMNode2 = doc
'domNode.RemoveChild (child)

Private Function fRangeValues() As tRef()
Dim aValue() As tRef
Dim lgItem As Long
Dim oWsh As Excel.Worksheet
Dim rgCellTypeFormulas As Excel.Range
Dim rgCellTypeConstants As Excel.Range
Dim rgCellTypeFormat As Excel.Range
Dim rgSelection As Excel.Range
Dim rgFormated As Excel.Range
Dim oCells As Excel.Range
Dim oCell As Excel.Range
Dim bFormated As Boolean

Set oWsh = ActiveSheet
With oWsh
'Selecting hardcoded data and formulas
On Error Resume Next
Set rgCellTypeConstants = .Cells.SpecialCells(xlCellTypeConstants)
Set rgCellTypeFormulas = .Cells.SpecialCells(xlCellTypeFormulas)
'Set rgCellTypeFormat = .Cells.SpecialCells(xlCellTypeAllFormatConditions)
On Error GoTo 0

If rgCellTypeFormulas Is Nothing And Not rgCellTypeConstants Is Nothing Then
Set oCells = rgCellTypeConstants
ElseIf rgCellTypeConstants Is Nothing And Not rgCellTypeFormulas Is Nothing Then
Set oCells = rgCellTypeFormulas
ElseIf Not rgCellTypeConstants Is Nothing And Not rgCellTypeFormulas Is Nothing Then
Set oCells = Union(rgCellTypeConstants, rgCellTypeFormulas)
End If
Set rgSelection = Selection
Set oCells = Intersect(oCells, rgSelection)

If Not oCells Is Nothing Then
oCells.Select

lgItem = -1
ReDim aValue(0 To oCells.Count - 1)
For Each oCell In oCells
lgItem = lgItem + 1
aValue(lgItem).Link = oCell.Value2
aValue(lgItem).File = oCell.Offset(0, 2).Value2
aValue(lgItem).NewFile = oCell.Offset(0, 3).Value2
Next oCell
End If
End With

fRangeValues = aValue()
Erase aValue()
End Function

Private Sub sHTML_Load()
Dim lgChild As Long
Dim lgElement As Long
Dim lgR As Long
Dim lgC As Long

Dim oWsh As Excel.Worksheet
Dim rgCellTypeFormulas As Excel.Range
Dim rgCellTypeConstants As Excel.Range
Dim rgCellTypeFormat As Excel.Range
Dim rgSelection As Excel.Range
Dim rgFormated As Excel.Range
Dim oCells As Excel.Range
Dim oCell As Excel.Range
Dim bFormated As Boolean
Dim iFile As Integer
Dim strFile As String

'!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
MsgBox "If you're going to download, declare as InternetExplorer, if you're going to depureate as InternetExplorerMedium"
'!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!

Call fIE_Initialize

Set oWsh = ActiveSheet
With oWsh
'Selecting hardcoded data and formulas
On Error Resume Next
Set rgCellTypeConstants = .Cells.SpecialCells(xlCellTypeConstants)
Set rgCellTypeFormulas = .Cells.SpecialCells(xlCellTypeFormulas)
'Set rgCellTypeFormat = .Cells.SpecialCells(xlCellTypeAllFormatConditions)
On Error GoTo 0

If rgCellTypeFormulas Is Nothing And Not rgCellTypeConstants Is Nothing Then
Set oCells = rgCellTypeConstants
ElseIf rgCellTypeConstants Is Nothing And Not rgCellTypeFormulas Is Nothing Then
Set oCells = rgCellTypeFormulas
ElseIf Not rgCellTypeConstants Is Nothing And Not rgCellTypeFormulas Is Nothing Then
Set oCells = Union(rgCellTypeConstants, rgCellTypeFormulas)
End If
Set rgSelection = Selection
Set oCells = Intersect(oCells, rgSelection)

lgR = 0
If Not oCells Is Nothing Then
oCells.Select

For Each oCell In oCells
'If oWsh.Rows(oCell.Row).OutlineLevel > 4 Then
If VBA.Trim$(oCell.Value2) <> vbNullString And _
Not VBA.Trim$(oCell.Value2) Like "Link to *" Then
'strFile = VBA.Trim$(VBA.Replace(oCell.Offset(0, -4).Value2, "/", "_"))
'strFile = VBA.Environ$("UserProfile") & "\Downloads\" & strFile & ".htm"
'If FileExists(strFile) Then
strURL = oCell.Value2

If fIE_Load(strURL) Then
Call fIE_Document

'strFile = VBA.Trim$(VBA.Replace(oCell.Offset(0, -4).Value2, "/", "_"))
'strFile = VBA.Trim$(VBA.Replace(oCell.Offset(0, 2).Value2, "/", "_"))
'strFile = VBA.Trim$(VBA.Replace(oCell.Offset(0, 1).Value2, "/", "_"))
strFile = oCell.Offset(0, 2).Value2 & ".htm"

'Selectors: ------------------------------------------------------
'.getElementById --> as IHTMLElement
'.getElementsByClassName --> as IHTMLElementCollection
'.getElementsByName --> as IHTMLElementCollection
'.getElementsByTagName --> as IHTMLElementCollection
'.getElementsByTagNameNS --> as IHTMLElementCollection

'Set hChildrenCollection = hDoc.querySelectorAll("#images img[src^='img/']")
'
'.querySelectorAll("button span")(0).innerText = "polskiego"
'.querySelectorAll("button span")(1).innerText = "angielskiego"
'
'.querySelectorAll("button[class = ""btn dropdown-toggle""]")(0).innerText = "chinskiego"
'.querySelectorAll("button[class = ""btn dropdown-toggle""]")(1).innerText = "bulgarski"
'
'.querySelectorAll("button span")(0).innerText = "chinskiego"
'.querySelectorAll("button span")(1).innerText = "chinskiego"
'
'.querySelectorAll("button.btn.dropdown-toggle span")(0).innerText = "chinskiego"
'.querySelectorAll("button.btn.dropdown-toggle span")(1).innerText = "arabskiego"
' -----------------------------------------------------------------

' Delete content:
'Set hCollection = hDoc.getElementsByClassName("...")
'For Each hElement In hCollection
' hElement.innerHTML = ""
'Next hElement

' Get (and Set) relevant HTML
'hDoc.DocumentElement.innerHTML = hDoc.getElementById("npy_celda_texto").innerHTML 'this is the relevant content
iFile = VBA.FreeFile()
strFile = VBA.Environ$("UserProfile") & "\Downloads\" & strFile '& ".htm"
Open strFile For Output Shared As #iFile
Print #iFile, hDoc.DocumentElement.innerHTML
Close #iFile

' Ban control...
If FileLen(strFile) > 10000 Then
oCell.Hyperlinks.Add Anchor:=oCell, Address:=strFile
Else
Stop
End If
End If
'Stop
'Else
' oCell.Hyperlinks.Add Anchor:=oCell, Address:=strFile
'End If
End If
'End If
Next oCell
End If
End With

'hDoc.all.UserName.value = " "
'hDoc.all.Password.value = " "
'For Each hElement In hDoc.getElementsByTagName("...")
' If hElement.Type = "submit" Then hElement.Click: Exit For
'Next

'Stop

ExitProc:
Call fIE_Terminate
End Sub

Private Sub sHTMLTableToXLS() 'byVal hTblColl As IHTMLElementCollection)
'Call fIE_Initialize
'Call fIE_Load(strURL)
'Call fIE_Document()

'HTML Table to Excel
Dim hTblColl As MSHTML.IHTMLElementCollection 'Object
Dim hBdyColl As MSHTML.IHTMLElementCollection 'Object
Dim hTRColl As MSHTML.IHTMLElementCollection 'Object
Dim hTDColl As MSHTML.IHTMLElementCollection 'Object
'Dim hTbl As MSHTML.IHTMLElement 'Object ' table
Dim hBdy As MSHTML.IHTMLElement 'Object ' body
'Dim hRow As MSHTML.IHTMLElement 'Object ' row
'Dim hCell As MSHTML.IHTMLElement 'Object ' cell
Dim hTbl As MSHTML.HTMLTable
Dim hRow As MSHTML.HTMLTableRow
Dim hCell As MSHTML.HTMLTableCell

Dim strText As String
Dim lgC As Long
Dim lgR As Long

Set hTblColl = hDoc.getElementsByTagName("table")

lgC = 0 'Column A in Excel
lgR = 0 'Row 1 in Excel

For Each hTbl In hTblColl
If VBA.UCase$(hTbl.className) = VBA.UCase$("...") Then
For Each hRow In hTbl.Rows
lgR = lgR + 1
lgC = 0 ' Resets back to column A
For Each hCell In hRow.Cells
lgC = lgC + 1
oWsh.Cells(lgR, lgC).Value = hCell.innerText
'Debug.Print hCell.innerText
Next hCell
Next hRow
End If

'Set hBdyColl = hTbl.getElementsByTagName("tbody")
'For Each hBdy In hBdyColl
' Set hTRColl = hBdy.getElementsByTagName("tr")
' For Each hRow In hTRColl
' lgR = lgR + 1
' lgC = 0 ' Resets back column
' Set hTDColl = hRow.getElementsByTagName("td")
' For Each hCell In hTDColl
' oWsh.Cells(lgR, lgC).value = hCell.innerText
' lgC = lgC + 1
' Next hCell
' DoEvents
' Next hRow
' 'Exit For
'Next hBdy
'Exit For
Next hTbl
End Sub

Private Function fDeleteElement(ByRef hElement As MSHTML.IHTMLElement) As Boolean ', _
ByRef hDoc As MSHTML.HTMLDocument) As Boolean

Dim strIdDelete As String

On Error GoTo ErrControl
strIdDelete = "Delete"
Do
strIdDelete = strIdDelete & "_"
Set hNode = hDoc.getElementById(strIdDelete)
Loop Until (hNode Is Nothing)
hElement.setAttribute "id", strIdDelete
Set hNode = hDoc.getElementById(strIdDelete)
hNode.ParentNode.RemoveChild hNode
fDeleteElement = True

ExitProc:
On Error GoTo 0
Exit Function

ErrControl:
GoTo ExitProc
End Function

Private Function DelTagById(ByRef strData As String, _
ByVal strID As String) As String

On Local Error GoTo MyError

Dim hDoc As HTMLDocument
Dim hNode As IHTMLDOMNode

DelTagById = strData
If strID = "" Then GoTo MyExit

Set hDoc = New HTMLDocument
hDoc.body.innerHTML = strData

Set hNode = hDoc.getElementById(strID)
If hNode Is Nothing Then GoTo MyExit
hNode.ParentNode.RemoveChild hNode

DelTagById = hDoc.body.innerHTML

MyExit:
Set hNode = Nothing
Set hDoc = Nothing
Exit Function
MyError:
'Handle Error
Resume MyExit
End Function

Private Function fIE_Initialize(Optional ByRef bIEMedium As Boolean = False) As Boolean
Set oWbk = Excel.ActiveWorkbook
Set oWsh = oWbk.ActiveSheet

'open Internet Explorer in memory, and go to website
'Dim oIE As New InternetExplorer
'If oIE Is Nothing Then
If bIEMedium Then
'Set oIE = New InternetExplorerMedium
Set oIE = CreateObject("new:{D5E8041D-920F-45e9-B8FB-B1DEB82C6E5E}") 'Internet Explorer Medium
Set oIE = GetObject("new:{D5E8041D-920F-45e9-B8FB-B1DEB82C6E5E}") 'Internet Explorer Medium
Else
Set oIE = CreateObject("InternetExplorer.Application")
'Set oIE = New InternetExplorer
End If
'End If
oIE.Visible = True

fIE_Initialize = Not (oIE Is Nothing)
End Function

Private Function fIE_Load(Optional ByVal strURL As String = vbNullString, _
Optional ByVal bMedium As Boolean = False) As Boolean
Dim Timer As Date

If strURL = vbNullString Then
strURL = VBA.InputBox("URL:", , "https://...")
End If

'Wait until IE is done loading page
Err.Clear
'On Error Resume Next
oIE.navigate strURL '"http://", , , , "Content-Type: application/x-www-form-urlencoded" & vbCrLf
Timer = VBA.Now()

'On Error Resume Next
While oIE.Busy
Application.StatusBar = "Trying to connect..."
DoEvents
Wend

If bMedium Then
Application.Wait (VBA.Now + TimeValue("0:00:05"))
Else
Do While oIE.readyState <> READYSTATE_COMPLETE
Application.StatusBar = "Trying to connect..."
DoEvents
'If (VBA.Now() - Timer) > 10000 Then Stop: End
Loop
End If
If Err.Number = 462 Then
If strURL Like "file:///*" Then
fIE_Load = True
Else
'Stop: End
fIE_Load = False
End If
Else
fIE_Load = True
End If
Err.Clear
On Error GoTo 0

Application.StatusBar = ""
End Function

Private Function fIE_Document() As Boolean
'show text of HTML document returned
Set hDoc = oIE.Document

'save to file: hDoc.DocumentElement.innerHTML

' Also: ------------------------
''Create HTMLFile Object
'Set hDoc = CreateObject("htmlfile")
'
''Get the WebPage Content to HTMLFile Object
'With CreateObject("msxml2.xmlhttp")
' .Open "GET", strURL, False
' .send
' '??? hDoc.body.innerHTML = .responseText
'End With
' ------------------------------

End Function

Private Function fIE_Terminate() As Boolean 'ByVal oIE As InternetExplorer) 'Object

Set hDoc = Nothing

'close down IE and reset status bar
' oIE.Refresh ' Refresh if needed
Set oIE = Nothing

Application.StatusBar = ""

Err_Clear:
If Err <> 0 Then
Debug.Assert Err = 0
Err.Clear
'Resume Next
End If
End Function

Function FileExists(ByVal fullFileName As String) As Boolean
FileExists = VBA.Len(VBA.Dir(fullFileName, vbArchive)) > 0
End Function

Public Function OpenObject(ByVal strFSO As String) As Boolean
' Open files and/or folders and/or websites/or create emails
'strFSO = "C:\Test Files\" & "File.xls"
' = "http://www.siteURL.com/"
' = "mailto:YourEmailHere@Website.com" & "?subject=Test"

ActiveWorkbook.FollowHyperlink Address:=strFSO, NewWindow:=True
End Function

Leave a Reply

Your email address will not be published. Required fields are marked *