VBA Linear regression vs Ramer-Douglas-Peucker_algorithm

It’s a bit annoying that wherever I look for it, linear regression is always beeing formulated as the minimization function of the vertical distances sum. This covers the casuistic where data is prominently in an horizontal distribution, so it has the best chance to assure that if range is below π/4, it will get the best line that fits the data. But, how about if data is spread in the half sector above π/4. Then the minimization function should look for the horizontal distance. Lets take a look for an ill conditioned dataset as the one pictured below: illConditionedDataOr figure a random set of points inside a vertical oriented ellipse (there is also some code to draw the ellipse perimeter). This kind of data set may never get a chance to be fitted if looking for vertical distance minimization. To solve this I’ve come to this code that solves the lack of criteria. It still can not do anything against three of the Anscombe’s quartet, but, you know, there should not be tried a linear regression against any ill conditioned data. And finally, but not the least, some code to compute Ramer-Douglas-Peucker (RDP) approximation to a dataset, very handy to reduce the number of points. I’d already developed an algorithm, not so CPU demanding, that also achieves similar reduction, but RPD is interesting as it’s not constrained by minimal length criteria… but it takes a life to compute some big sets of data.
Option Explicit

Public Const PI As Double = 3.14159265358979
Public Const PI_RAD As Double = PI / 180
Public Const EULER As Double = 2.71828182845905
Public Const g_BASE As Long = 0

Private Type tXYZ
    X As Double
    Y As Double
    Z As Double
End Type

Public Sub sLinearRegression()
    Dim oPoint() As tXYZ
    Dim lgItem As Long
    Dim aData As Variant
    Dim dbR²ToX As Double
    Dim dbR²ToY As Double
    Dim aRegressionV() As Double
    Dim aRegressionH() As Double

    aData = Selection.Value2
    ReDim oPoint(LBound(aData, 1) To UBound(aData, 1))
    For lgItem = LBound(aData, 1) To UBound(aData, 1)
        oPoint(lgItem).X = aData(lgItem, 1)
        oPoint(lgItem).Y = aData(lgItem, 2)
    Next lgItem

    aRegressionV() = fLinearRegression(oPoint())

    Erase oPoint()
    Erase aData
End Sub

Public Function fLinearRegressionRange(ByRef rgRangeData As Excel.Range) As Variant
    Dim oPoint() As tXYZ
    Dim lgItem As Long
    Dim aData As Variant

    aData = rgRangeData.Value2
    ReDim oPoint(LBound(aData, 1) To UBound(aData, 1))
    For lgItem = LBound(aData, 1) To UBound(aData, 1)
        oPoint(lgItem).X = aData(lgItem, 1)
        oPoint(lgItem).Y = aData(lgItem, 2)
    Next lgItem

    fLinearRegressionRange = fLinearRegression(oPoint())

    Erase oPoint()
    Erase aData
End Function

Public Function fLinearRegression(ByRef oPoint() As tXYZ) As Double()
' http://mathworld.wolfram.com/CorrelationCoefficient.html
' Check the Anscombe's quartet (ill conditioned data)
    'Y = ß0 + (ß1 * X)
    Dim lgItem As Long
    Dim lgPoints As Long
    Dim Xmean As Double
    Dim Ymean As Double
    Dim SSxx As Double
    Dim SSyy As Double
    Dim SSxy As Double
    'Dim SSR As Double
    'Dim SSE As Double
    Dim aReturnV(1 To 7) As Double
    Dim aReturnH(1 To 7) As Double

    For lgItem = LBound(oPoint) To UBound(oPoint)
        Xmean = Xmean + oPoint(lgItem).X
        Ymean = Ymean + oPoint(lgItem).Y
    Next lgItem
    lgPoints = (UBound(oPoint) - LBound(oPoint) + 1)
    Xmean = Xmean / lgPoints
    Ymean = Ymean / lgPoints

    SSxx = -(Xmean * Xmean) * lgPoints
    SSyy = -(Ymean * Ymean) * lgPoints
    SSxy = -(Xmean * Ymean) * lgPoints
    For lgItem = LBound(oPoint) To UBound(oPoint)
        ' For Correlation Coefficient
        SSxx = SSxx + (oPoint(lgItem).X * oPoint(lgItem).X)
        SSyy = SSyy + (oPoint(lgItem).Y * oPoint(lgItem).Y)
        SSxy = SSxy + (oPoint(lgItem).X * oPoint(lgItem).Y)
    Next lgItem

    ' Find SSR (sum of squared residuals, the quantity to minimize) and SSE (sum of squared errors)
    aReturnV(1) = SSxy / SSxx                    ' Slope --> ß1
    aReturnV(2) = Ymean - (aReturnV(1) * Xmean)  ' Origin --> ß0
    aReturnV(3) = SSxy ^ 2 / (SSxx * SSyy)       ' Correlation Coefficient --> R²
    aReturnV(4) = Xmean
    aReturnV(5) = Ymean
    aReturnV(6) = SSyy * (1 - aReturnV(3))   ' SSR = SSyy * (1 - R²)
    aReturnV(7) = SSyy - aReturnV(6)         ' SSE = ß1 * SSxy = ß1² * SSxx = SSyy - SSR

    ' Rotate the minimizer function 90º, to check ill conditioned data
    aReturnH(1) = SSxy / SSyy                    ' Slope --> ß1'
    aReturnH(2) = Xmean - (aReturnH(1) * Ymean)  ' Origin --> ß0'

    ' Get inverse correlation:
    aReturnH(1) = 1 / aReturnH(1)                ' ß1 = 1/ß1'
    aReturnH(2) = -aReturnH(2) * aReturnH(1)     ' ß0 = -ß0'* ß1

    aReturnH(3) = SSxy ^ 2 / (SSxx * SSyy)       ' Correlation Coefficient --> R²
    aReturnH(4) = Xmean
    aReturnH(5) = Ymean
    aReturnH(6) = SSxx * (1 - aReturnH(3))   ' SSR = SSxx * (1 - R²)
    aReturnH(7) = SSxx - aReturnH(6)         ' SSE = ß1 * SSxy = ß1² * SSyy = SSxx - SSR

    If aReturnH(6) > aReturnV(6) Then
        fLinearRegression = aReturnV()
    Else
        fLinearRegression = aReturnH()
    End If

    Erase aReturnV()
    Erase aReturnH()
End Function

Public Function fEllipseXL(ByRef aSemiAxis As Double, _
                           ByRef bSemiAxis As Double, _
                           ByRef Ø As Double, _
                           ByRef Xc As Double, _
                           ByRef Yc As Double, _
                           Optional ByRef ß As Double, _
                           Optional ByRef bFull As Boolean) As Double()
    Dim oCenter As tXYZ
    Dim oPoint() As tXYZ
    Dim aData() As Double
    Dim lgPoint As Long

    With oCenter
        .X = Xc
        .Y = Yc
    End With

    'oPoint() = fEllipse(aSemiAxis, bSemiAxis, Ø, oCenter, ß, bFull)
    oPoint() = fEllipsePolar(aSemiAxis, bSemiAxis, Ø, oCenter, ß, bFull)
    ReDim aData(LBound(oPoint) To UBound(oPoint), 1 To 2)
    For lgPoint = LBound(oPoint) To UBound(oPoint)
        aData(lgPoint, 1) = oPoint(lgPoint).X
        aData(lgPoint, 2) = oPoint(lgPoint).Y
    Next lgPoint

    fEllipseXL = aData()
End Function

Public Function fEllipsePolar(ByRef aSemiAxis As Double, _
                              ByRef bSemiAxis As Double, _
                              ByRef Ø As Double, _
                              ByRef oCenter As tXYZ, _
                              Optional ByRef ß As Double = 0, _
                              Optional ByRef bFull As Boolean = False) As tXYZ()
' For polar angles from center

    Dim lgAngle As Long
    Dim lgPoint As Long
    Dim dbRadius As Double
    Dim dbAngle As Double
    Dim oPoint() As tXYZ
    Dim oPointTmp As tXYZ
    Dim dbMajorAxis As Double
    Dim eccentricity As Double

    If bFull Then
        ' Fill initial values
        lgPoint = g_BASE - 1

        ' Select case major semiaxis
        If aSemiAxis >= bSemiAxis Then
            eccentricity = VBA.Sqr((aSemiAxis * aSemiAxis) - (bSemiAxis * bSemiAxis)) / aSemiAxis

            For lgAngle = 0 To 360 Step 10
                lgPoint = lgPoint + 1
                ReDim Preserve oPoint(0 To lgPoint)
                dbAngle = VBA.CDbl(lgAngle * PI_RAD)
                dbRadius = bSemiAxis / VBA.Sqr(1 - (eccentricity * VBA.Cos(dbAngle) ^ 2))
                oPoint(lgPoint).X = dbRadius * VBA.Cos(dbAngle)
                oPoint(lgPoint).Y = dbRadius * VBA.Sin(dbAngle)
            Next lgAngle

        ElseIf aSemiAxis < bSemiAxis Then
            eccentricity = VBA.Sqr((bSemiAxis * bSemiAxis) - (aSemiAxis * aSemiAxis)) / bSemiAxis

            For lgAngle = 0 To 360 Step 10
                lgPoint = lgPoint + 1
                ReDim Preserve oPoint(0 To lgPoint)
                dbAngle = VBA.CDbl(lgAngle * PI_RAD)
                dbRadius = aSemiAxis / VBA.Sqr(1 - (eccentricity * VBA.Cos(dbAngle) ^ 2))
                oPoint(lgPoint).X = dbRadius * VBA.Cos(dbAngle)
                oPoint(lgPoint).Y = dbRadius * VBA.Sin(dbAngle)
            Next lgAngle
        End If

        For lgPoint = LBound(oPoint) To UBound(oPoint)
            ' For a rotated ellipse:
            oPointTmp.X = (oPoint(lgPoint).X * Cos(Ø)) - (oPoint(lgPoint).Y * Sin(Ø)) + oCenter.X
            oPointTmp.Y = (oPoint(lgPoint).X * Sin(Ø)) + (oPoint(lgPoint).Y * Cos(Ø)) + oCenter.Y
            oPoint(lgPoint) = oPointTmp
        Next lgPoint
    Else
        ReDim oPoint(0)
    End If

    fEllipsePolar = oPoint()

End Function

Public Function fEllipse(ByRef aSemiAxis As Double, _
                         ByRef bSemiAxis As Double, _
                         ByRef Ø As Double, _
                         ByRef oCenter As tXYZ, _
                         Optional ByRef ß As Double = 0, _
                         Optional ByRef bFull As Boolean = False) As tXYZ()
' https://en.wikipedia.org/wiki/Ellipse
' Cannonical equation A·X² + B·X·Y + C·Y² + D·X + E·Y + F = 0
' canonical implicit equation: Xcan²/a² + Ycan²/b² = 1
' Xcan = (X - oCenter.X) * Cos(Ø) + (Y - oCenter.Y) * Sin(Ø)
' Ycan = -(X - oCenter.X) * Sin(Ø) + (Y - oCenter.Y) * Cos(Ø)
' Parametric form: (x,y)=(a*cos t,b*sin t) 0 <= t < 2*pi
' A = aSemiAxis² * (Sin(Ø))² + bSemiAxis² * (Cos(Ø))²
' B = 2 * (bSemiAxis² - aSemiAxis²) * Sin(Ø) * Cos(Ø)
' C = aSemiAxis² * (Cos(Ø))² + bSemiAxis² * (Sin(Ø))²
' D = -2·A·oCenter.X - B·oCenter.Y
' E = -B·oCenter.X - 2·C·oCenter.Y
' F = A·oCenter.X² + B·oCenter.X·oCenter.Y + C·oCenter.Y² - A²·B²
' ð = ((A·C) - (B²/4))·F + (B·E·D/4) - ((C·D²)/4) - ((A·E²)/4)

    Dim ð As Double 'discriminant for non degenerated ellipse
    Dim A As Double
    Dim B As Double
    Dim C As Double
    Dim D As Double
    Dim E As Double
    Dim F As Double
    Dim lgSegment As Long
    Dim lgPoint As Long
    Dim oPoint() As tXYZ
    Dim oPointTmp As tXYZ

    A = (aSemiAxis * Sin(Ø)) ^ 2 + (bSemiAxis * Cos(Ø)) ^ 2
    B = 2 * (bSemiAxis ^ 2 - aSemiAxis ^ 2) * Sin(Ø) * Cos(Ø)
    C = (aSemiAxis * Cos(Ø)) ^ 2 + (bSemiAxis * Sin(Ø)) ^ 2
    D = -2 * A * oCenter.X - B * oCenter.Y
    E = -B * oCenter.X - 2 * C * oCenter.Y
    F = (A * oCenter.X ^ 2) + (B * oCenter.X * oCenter.Y) + (C * oCenter.Y ^ 2) - (A * B) ^ 2

    ð = ((A * C) - (B ^ 2 / 4)) * F + (B * E * D / 4) - ((C * D ^ 2) / 4) - ((A * E ^ 2) / 4)

    If ð <= 0 Then ' not degenerated ellipse
        If bFull Then
            ReDim oPoint(0 To 40)
            ' Fill initial values
            lgPoint = -1
            For lgSegment = 0 To 19
                lgPoint = lgPoint + 1
                oPoint(lgPoint).X = -aSemiAxis + (aSemiAxis * lgSegment / 10)
                oPoint(lgPoint).Y = bSemiAxis * VBA.Sqr(1 - (oPoint(lgPoint).X / aSemiAxis) ^ 2)
            Next lgSegment

            lgPoint = lgPoint + 1
            oPoint(lgPoint).X = aSemiAxis
            oPoint(lgPoint).Y = 0

            For lgSegment = 19 To 1 Step -1
                lgPoint = lgPoint + 1
                oPoint(lgPoint).X = -aSemiAxis + (aSemiAxis * lgSegment / 10)
                oPoint(lgPoint).Y = -bSemiAxis * VBA.Sqr((1 - (oPoint(lgPoint).X / aSemiAxis) ^ 2))
            Next lgSegment

            lgPoint = lgPoint + 1
            oPoint(lgPoint) = oPoint(LBound(oPoint))

            For lgPoint = 0 To 40
                'For a rotated ellipse:
                oPointTmp.X = (oPoint(lgPoint).X * Cos(Ø)) - (oPoint(lgPoint).Y * Sin(Ø)) + oCenter.X
                oPointTmp.Y = (oPoint(lgPoint).X * Sin(Ø)) + (oPoint(lgPoint).Y * Cos(Ø)) + oCenter.Y
                oPoint(lgPoint) = oPointTmp
            Next lgPoint
        Else
            ReDim oPoint(0)
        End If

        fEllipse = oPoint()

    'ElseIf ð = 0 Then 'point ellipse
    '    ReDim oPoint(0)
    '    oPoint(0) = oCenter
    '    fEllipse = oPoint()

    End If

End Function

Private Function fDistanceToLine(ByVal px As Double, ByVal py As Double, _
                                 ByVal X1 As Double, ByVal Y1 As Double, _
                                 ByVal X2 As Double, ByVal Y2 As Double, _
                                 Optional ByRef t As Double) As Double
' Calculate the distance between the point and the segment.
' http://vb-helper.com/howto_distance_point_to_line.html
' https://en.wikipedia.org/wiki/Distance_from_a_point_to_a_line
' http://mathworld.wolfram.com/Point-LineDistance2-Dimensional.html
    Dim dX As Double
    Dim dY As Double

    dX = X2 - X1
    dY = Y2 - Y1
    If (dX = 0 And dY = 0) Then
    ' It's a point not a line segment.
        dX = px - X1
        dY = py - Y1
        near_x = X1
        near_y = Y1
        fDistanceToLine = VBA.Sqr(dX * dX + dY * dY)
        Exit Function
    End If

    ' Calculate the t that minimizes the distance.
    t = ((px - X1) * dX + (py - Y1) * dY) / (dX * dX + dY * dY)

    ' See if this represents one of the segment's end points or a point in the middle.
    If t  1 Then
        dX = px - X2
        dY = py - Y2
        near_x = X2
        near_y = Y2
    Else
        near_x = X1 + t * dX
        near_y = Y1 + t * dY
        dX = px - near_x
        dY = py - near_y
    End If

    fDistanceToLine = VBA.Sqr(dX * dX + dY * dY)
End Function

Public Function fDistance2DPoints(ByRef oPointA As tXYZ, ByRef oPointB As tXYZ) As Double
    Dim dX As Double
    Dim dY As Double

    dX = oPointA.X - oPointB.X
    dY = oPointA.Y - oPointB.Y

    fDistance2DPoints = VBA.Sqr((dX * dX) + (dY * dY))
End Function

Public Sub sDouglasPeucker()
    Dim lgItem As Long
    Dim lgR As Long
    Dim lgC As Long
    Dim aData As Variant
    Dim oPoint() As tXYZ
    Dim oPtFilter() As tXYZ
    Dim Threshold As Double

    aData = Selection.Value2
    ReDim oPoint(LBound(aData, 1) To UBound(aData, 1))
    For lgItem = LBound(aData, 1) To UBound(aData, 1)
        oPoint(lgItem).X = aData(lgItem, 1)
        oPoint(lgItem).Y = aData(lgItem, 2)
    Next lgItem

    Threshold = VBA.Val(VBA.InputBox("Threshold value", , 1))
    oPtFilter() = fDouglasPeucker(oPoint(), Threshold)

    'Show in Worksheet:
    With ActiveSheet
        lgR = Selection.Row - 1
        lgC = Selection.Column
        For lgItem = LBound(oPtFilter) To UBound(oPtFilter)
            lgR = lgR + 1
            .Cells(lgR, lgC + 2).Value2 = oPtFilter(lgItem).X
            .Cells(lgR, lgC + 3).Value2 = oPtFilter(lgItem).Y
        Next lgItem
    End With

    Erase oPoint()
    Erase aData
End Sub

Public Function fDouglasPeucker(ByRef oPoint() As tXYZ, _
                                Optional ByVal Threshold As Double = 0) As tXYZ()
' https://en.wikipedia.org/wiki/Ramer%E2%80%93Douglas%E2%80%93Peucker_algorithm
' Ramer–Douglas–Peucker algorithm (RDP)
' ToDo: Solve "crossing" segments (one/several vertex get trapped)...
' ToDo: if no more than one vertex get inside a segment, then if that vertex below threshold, can be deleted

    Dim oLine() As tXYZ
    Dim lgPoint As Long
    Dim lgSegment As Long
    Dim lgItem As Long
    Dim lgMove As Long
    Dim dbDistance As Double
    Dim dbDistanceMax As Double
    Dim dbCheck As Double
    Dim lgBreaker As Long
    Dim bClosest As Boolean
    Dim dbVectorMultiplier As Double
    Dim bBreaker As Boolean
    Dim bIterate As Boolean

    If Not (Not oPoint()) Then
        If Threshold = 0 Then
            fDouglasPeucker = oPoint()
            Exit Function
        End If

        ' Get K.P. for each point
        oPoint(LBound(oPoint)).Z = 0
        For lgPoint = (LBound(oPoint) + 1) To UBound(oPoint)
            oPoint(lgPoint).Z = oPoint(lgPoint - 1).Z + fDistance2DPoints(oPoint(lgPoint), oPoint(lgPoint - 1))
        Next lgPoint

        lgSegment = g_BASE + 1
        ReDim Preserve oLine(g_BASE + 0 To lgSegment)
        oLine(g_BASE + 0) = oPoint(LBound(oPoint))
        oLine(g_BASE + 1) = oPoint(UBound(oPoint))

        Do
            bIterate = False

            lgPoint = LBound(oPoint) ' initialze with first item
            For lgSegment = LBound(oLine) To UBound(oLine) - 1
                dbDistanceMax = Threshold
                bBreaker = False

                For lgPoint = (LBound(oPoint) + 1) To (UBound(oPoint) - 1) 'Avoid extremes, as they are already in the final set
                    dbDistance = fDistanceToLine(oPoint(lgPoint).X, oPoint(lgPoint).Y, _
                                                 oLine(lgSegment).X, oLine(lgSegment).Y, _
                                                 oLine(lgSegment + 1).X, oLine(lgSegment + 1).Y, _
                                                 dbVectorMultiplier)
                    ' First condition to apply only if is inside range: oLine(lgSegment).KP <= oNearestOnLine.KP = 0 And dbVectorMultiplier <= 1) Then
                        ' Second condition is to apply only if point is the closest to any other segment...
                        bClosest = True ' Initialize
                        For lgItem = LBound(oLine) To (UBound(oLine) - 1)
                            If lgItem  lgSegment Then ' avoid itself...
                            dbCheck = fDistanceToLine(oPoint(lgPoint).X, oPoint(lgPoint).Y, _
                                                      oLine(lgItem).X, oLine(lgItem).Y, _
                                                      oLine(lgItem + 1).X, oLine(lgItem + 1).Y, _
                                                      dbVectorMultiplier)
                                If (dbVectorMultiplier >= 0 And dbVectorMultiplier <= 1) Then ' Only if inside range...
                                    If dbCheck < dbDistance Then
                                        bClosest = False
                                        Exit For
                                    End If
                                End If
                            End If
                        Next lgItem

                        If bClosest Then
                            If dbDistanceMax  UBound(oPoint) Then Exit For ' only if points sorted...
            Next lgSegment
        Loop While bIterate

        fDouglasPeucker = oLine()
    End If
End Function

Public Function fDouglasPeuckerRange(ByRef rgRangeData As Excel.Range, _
                                     Optional ByVal Threshold As Double = 0) As Variant
    Dim lgItem As Long
    Dim aData As Variant
    Dim oPoint() As tXYZ
    Dim oPtFilter() As tXYZ

    aData = Selection.Value2
    ReDim Preserve oPoint(LBound(aData, 1) To UBound(aData, 1))
    For lgItem = LBound(aData, 1) To UBound(aData, 1)
        oPoint(lgItem).X = aData(lgItem, 1)
        oPoint(lgItem).Y = aData(lgItem, 2)
    Next lgItem

    Threshold = 1
    oPtFilter() = fDouglasPeucker(oPoint(), Threshold)
    ReDim aData(LBound(oPtFilter) To UBound(oPtFilter), 1 To 2)
    For lgItem = LBound(oPtFilter) To UBound(oPtFilter)
        aData(lgItem, 1) = oPtFilter(lgItem).X
        aData(lgItem, 2) = oPtFilter(lgItem).Y
    Next lgItem
    fDouglasPeuckerRange = aData

    Erase oPoint()
    Erase aData
End Function
[/sourcecode]

Leave a Reply

Your email address will not be published. Required fields are marked *