VBA Linear regression vs Ramer-Douglas-Peucker_algorithm

It’s a bit annoying that wherever I look for it, linear regression is always beeing formulated as the minimization function of the vertical distances sum. This covers the casuistic where data is prominently in an horizontal distribution, so it has the best chance to assure that if range is below π/4, it will get the best line that fits the data.

But, how about if data is spread in the half sector above π/4. Then the minimization function should look for the horizontal distance.

Lets take a look for an ill conditioned dataset as the one pictured below:
illConditionedDataOr figure a random set of points inside a vertical oriented ellipse (there is also some code to draw the ellipse perimeter).

This kind of data set may never get a chance to be fitted if looking for vertical distance minimization.

To solve this I’ve come to this code that solves the lack of criteria. It still can not do anything against three of the Anscombe’s quartet, but, you know, there should not be tried a linear regression against any ill conditioned data.

And finally, but not the least, some code to compute Ramer-Douglas-Peucker (RDP) approximation to a dataset, very handy to reduce the number of points. I’d already developed an algorithm, not so CPU demanding, that also achieves similar reduction, but RPD is interesting as it’s not constrained by minimal length criteria… but it takes a life to compute some big sets of data.

Option Explicit

Public Const PI As Double = 3.14159265358979
Public Const PI_RAD As Double = PI / 180
Public Const EULER As Double = 2.71828182845905
Public Const g_BASE As Long = 0

Private Type tXYZ
X As Double
Y As Double
Z As Double
End Type

Public Sub sLinearRegression()
Dim oPoint() As tXYZ
Dim lgItem As Long
Dim aData As Variant
Dim dbR²ToX As Double
Dim dbR²ToY As Double
Dim aRegressionV() As Double
Dim aRegressionH() As Double

aData = Selection.Value2
ReDim oPoint(LBound(aData, 1) To UBound(aData, 1))
For lgItem = LBound(aData, 1) To UBound(aData, 1)
oPoint(lgItem).X = aData(lgItem, 1)
oPoint(lgItem).Y = aData(lgItem, 2)
Next lgItem

aRegressionV() = fLinearRegression(oPoint())

Erase oPoint()
Erase aData
End Sub

Public Function fLinearRegressionRange(ByRef rgRangeData As Excel.Range) As Variant
Dim oPoint() As tXYZ
Dim lgItem As Long
Dim aData As Variant

aData = rgRangeData.Value2
ReDim oPoint(LBound(aData, 1) To UBound(aData, 1))
For lgItem = LBound(aData, 1) To UBound(aData, 1)
oPoint(lgItem).X = aData(lgItem, 1)
oPoint(lgItem).Y = aData(lgItem, 2)
Next lgItem

fLinearRegressionRange = fLinearRegression(oPoint())

Erase oPoint()
Erase aData
End Function

Public Function fLinearRegression(ByRef oPoint() As tXYZ) As Double()
' http://mathworld.wolfram.com/CorrelationCoefficient.html
' Check the Anscombe's quartet (ill conditioned data)
'Y = ß0 + (ß1 * X)
Dim lgItem As Long
Dim lgPoints As Long
Dim Xmean As Double
Dim Ymean As Double
Dim SSxx As Double
Dim SSyy As Double
Dim SSxy As Double
'Dim SSR As Double
'Dim SSE As Double
Dim aReturnV(1 To 7) As Double
Dim aReturnH(1 To 7) As Double

For lgItem = LBound(oPoint) To UBound(oPoint)
Xmean = Xmean + oPoint(lgItem).X
Ymean = Ymean + oPoint(lgItem).Y
Next lgItem
lgPoints = (UBound(oPoint) - LBound(oPoint) + 1)
Xmean = Xmean / lgPoints
Ymean = Ymean / lgPoints

SSxx = -(Xmean * Xmean) * lgPoints
SSyy = -(Ymean * Ymean) * lgPoints
SSxy = -(Xmean * Ymean) * lgPoints
For lgItem = LBound(oPoint) To UBound(oPoint)
' For Correlation Coefficient
SSxx = SSxx + (oPoint(lgItem).X * oPoint(lgItem).X)
SSyy = SSyy + (oPoint(lgItem).Y * oPoint(lgItem).Y)
SSxy = SSxy + (oPoint(lgItem).X * oPoint(lgItem).Y)
Next lgItem

' Find SSR (sum of squared residuals, the quantity to minimize) and SSE (sum of squared errors)
aReturnV(1) = SSxy / SSxx ' Slope --> ß1
aReturnV(2) = Ymean - (aReturnV(1) * Xmean) ' Origin --> ß0
aReturnV(3) = SSxy ^ 2 / (SSxx * SSyy) ' Correlation Coefficient --> R²
aReturnV(4) = Xmean
aReturnV(5) = Ymean
aReturnV(6) = SSyy * (1 - aReturnV(3)) ' SSR = SSyy * (1 - R²)
aReturnV(7) = SSyy - aReturnV(6) ' SSE = ß1 * SSxy = ß1² * SSxx = SSyy - SSR

' Rotate the minimizer function 90º, to check ill conditioned data
aReturnH(1) = SSxy / SSyy ' Slope --> ß1'
aReturnH(2) = Xmean - (aReturnH(1) * Ymean) ' Origin --> ß0'

' Get inverse correlation:
aReturnH(1) = 1 / aReturnH(1) ' ß1 = 1/ß1'
aReturnH(2) = -aReturnH(2) * aReturnH(1) ' ß0 = -ß0'* ß1

aReturnH(3) = SSxy ^ 2 / (SSxx * SSyy) ' Correlation Coefficient --> R²
aReturnH(4) = Xmean
aReturnH(5) = Ymean
aReturnH(6) = SSxx * (1 - aReturnH(3)) ' SSR = SSxx * (1 - R²)
aReturnH(7) = SSxx - aReturnH(6) ' SSE = ß1 * SSxy = ß1² * SSyy = SSxx - SSR

If aReturnH(6) > aReturnV(6) Then
fLinearRegression = aReturnV()
Else
fLinearRegression = aReturnH()
End If

Erase aReturnV()
Erase aReturnH()
End Function

Public Function fEllipseXL(ByRef aSemiAxis As Double, _
ByRef bSemiAxis As Double, _
ByRef Ø As Double, _
ByRef Xc As Double, _
ByRef Yc As Double, _
Optional ByRef ß As Double, _
Optional ByRef bFull As Boolean) As Double()
Dim oCenter As tXYZ
Dim oPoint() As tXYZ
Dim aData() As Double
Dim lgPoint As Long

With oCenter
.X = Xc
.Y = Yc
End With

'oPoint() = fEllipse(aSemiAxis, bSemiAxis, Ø, oCenter, ß, bFull)
oPoint() = fEllipsePolar(aSemiAxis, bSemiAxis, Ø, oCenter, ß, bFull)
ReDim aData(LBound(oPoint) To UBound(oPoint), 1 To 2)
For lgPoint = LBound(oPoint) To UBound(oPoint)
aData(lgPoint, 1) = oPoint(lgPoint).X
aData(lgPoint, 2) = oPoint(lgPoint).Y
Next lgPoint

fEllipseXL = aData()
End Function

Public Function fEllipsePolar(ByRef aSemiAxis As Double, _
ByRef bSemiAxis As Double, _
ByRef Ø As Double, _
ByRef oCenter As tXYZ, _
Optional ByRef ß As Double = 0, _
Optional ByRef bFull As Boolean = False) As tXYZ()
' For polar angles from center

Dim lgAngle As Long
Dim lgPoint As Long
Dim dbRadius As Double
Dim dbAngle As Double
Dim oPoint() As tXYZ
Dim oPointTmp As tXYZ
Dim dbMajorAxis As Double
Dim eccentricity As Double

If bFull Then
' Fill initial values
lgPoint = g_BASE - 1

' Select case major semiaxis
If aSemiAxis >= bSemiAxis Then
eccentricity = VBA.Sqr((aSemiAxis * aSemiAxis) - (bSemiAxis * bSemiAxis)) / aSemiAxis

For lgAngle = 0 To 360 Step 10
lgPoint = lgPoint + 1
ReDim Preserve oPoint(0 To lgPoint)
dbAngle = VBA.CDbl(lgAngle * PI_RAD)
dbRadius = bSemiAxis / VBA.Sqr(1 - (eccentricity * VBA.Cos(dbAngle) ^ 2))
oPoint(lgPoint).X = dbRadius * VBA.Cos(dbAngle)
oPoint(lgPoint).Y = dbRadius * VBA.Sin(dbAngle)
Next lgAngle

ElseIf aSemiAxis < bSemiAxis Then
eccentricity = VBA.Sqr((bSemiAxis * bSemiAxis) - (aSemiAxis * aSemiAxis)) / bSemiAxis

For lgAngle = 0 To 360 Step 10
lgPoint = lgPoint + 1
ReDim Preserve oPoint(0 To lgPoint)
dbAngle = VBA.CDbl(lgAngle * PI_RAD)
dbRadius = aSemiAxis / VBA.Sqr(1 - (eccentricity * VBA.Cos(dbAngle) ^ 2))
oPoint(lgPoint).X = dbRadius * VBA.Cos(dbAngle)
oPoint(lgPoint).Y = dbRadius * VBA.Sin(dbAngle)
Next lgAngle
End If

For lgPoint = LBound(oPoint) To UBound(oPoint)
' For a rotated ellipse:
oPointTmp.X = (oPoint(lgPoint).X * Cos(Ø)) - (oPoint(lgPoint).Y * Sin(Ø)) + oCenter.X
oPointTmp.Y = (oPoint(lgPoint).X * Sin(Ø)) + (oPoint(lgPoint).Y * Cos(Ø)) + oCenter.Y
oPoint(lgPoint) = oPointTmp
Next lgPoint
Else
ReDim oPoint(0)
End If

fEllipsePolar = oPoint()

End Function

Public Function fEllipse(ByRef aSemiAxis As Double, _
ByRef bSemiAxis As Double, _
ByRef Ø As Double, _
ByRef oCenter As tXYZ, _
Optional ByRef ß As Double = 0, _
Optional ByRef bFull As Boolean = False) As tXYZ()
' https://en.wikipedia.org/wiki/Ellipse
' Cannonical equation A·X² + B·X·Y + C·Y² + D·X + E·Y + F = 0
' canonical implicit equation: Xcan²/a² + Ycan²/b² = 1
' Xcan = (X - oCenter.X) * Cos(Ø) + (Y - oCenter.Y) * Sin(Ø)
' Ycan = -(X - oCenter.X) * Sin(Ø) + (Y - oCenter.Y) * Cos(Ø)
' Parametric form: (x,y)=(a*cos t,b*sin t) 0 <= t < 2*pi
' A = aSemiAxis² * (Sin(Ø))² + bSemiAxis² * (Cos(Ø))²
' B = 2 * (bSemiAxis² - aSemiAxis²) * Sin(Ø) * Cos(Ø)
' C = aSemiAxis² * (Cos(Ø))² + bSemiAxis² * (Sin(Ø))²
' D = -2·A·oCenter.X - B·oCenter.Y
' E = -B·oCenter.X - 2·C·oCenter.Y
' F = A·oCenter.X² + B·oCenter.X·oCenter.Y + C·oCenter.Y² - A²·B²
' ð = ((A·C) - (B²/4))·F + (B·E·D/4) - ((C·D²)/4) - ((A·E²)/4)

Dim ð As Double 'discriminant for non degenerated ellipse
Dim A As Double
Dim B As Double
Dim C As Double
Dim D As Double
Dim E As Double
Dim F As Double
Dim lgSegment As Long
Dim lgPoint As Long
Dim oPoint() As tXYZ
Dim oPointTmp As tXYZ

A = (aSemiAxis * Sin(Ø)) ^ 2 + (bSemiAxis * Cos(Ø)) ^ 2
B = 2 * (bSemiAxis ^ 2 - aSemiAxis ^ 2) * Sin(Ø) * Cos(Ø)
C = (aSemiAxis * Cos(Ø)) ^ 2 + (bSemiAxis * Sin(Ø)) ^ 2
D = -2 * A * oCenter.X - B * oCenter.Y
E = -B * oCenter.X - 2 * C * oCenter.Y
F = (A * oCenter.X ^ 2) + (B * oCenter.X * oCenter.Y) + (C * oCenter.Y ^ 2) - (A * B) ^ 2

ð = ((A * C) - (B ^ 2 / 4)) * F + (B * E * D / 4) - ((C * D ^ 2) / 4) - ((A * E ^ 2) / 4)

If ð <= 0 Then ' not degenerated ellipse
If bFull Then
ReDim oPoint(0 To 40)
' Fill initial values
lgPoint = -1
For lgSegment = 0 To 19
lgPoint = lgPoint + 1
oPoint(lgPoint).X = -aSemiAxis + (aSemiAxis * lgSegment / 10)
oPoint(lgPoint).Y = bSemiAxis * VBA.Sqr(1 - (oPoint(lgPoint).X / aSemiAxis) ^ 2)
Next lgSegment

lgPoint = lgPoint + 1
oPoint(lgPoint).X = aSemiAxis
oPoint(lgPoint).Y = 0

For lgSegment = 19 To 1 Step -1
lgPoint = lgPoint + 1
oPoint(lgPoint).X = -aSemiAxis + (aSemiAxis * lgSegment / 10)
oPoint(lgPoint).Y = -bSemiAxis * VBA.Sqr((1 - (oPoint(lgPoint).X / aSemiAxis) ^ 2))
Next lgSegment

lgPoint = lgPoint + 1
oPoint(lgPoint) = oPoint(LBound(oPoint))

For lgPoint = 0 To 40
'For a rotated ellipse:
oPointTmp.X = (oPoint(lgPoint).X * Cos(Ø)) - (oPoint(lgPoint).Y * Sin(Ø)) + oCenter.X
oPointTmp.Y = (oPoint(lgPoint).X * Sin(Ø)) + (oPoint(lgPoint).Y * Cos(Ø)) + oCenter.Y
oPoint(lgPoint) = oPointTmp
Next lgPoint
Else
ReDim oPoint(0)
End If

fEllipse = oPoint()

'ElseIf ð = 0 Then 'point ellipse
' ReDim oPoint(0)
' oPoint(0) = oCenter
' fEllipse = oPoint()

End If

End Function

Private Function fDistanceToLine(ByVal px As Double, ByVal py As Double, _
ByVal X1 As Double, ByVal Y1 As Double, _
ByVal X2 As Double, ByVal Y2 As Double, _
Optional ByRef t As Double) As Double
' Calculate the distance between the point and the segment.
' http://vb-helper.com/howto_distance_point_to_line.html
' https://en.wikipedia.org/wiki/Distance_from_a_point_to_a_line
' http://mathworld.wolfram.com/Point-LineDistance2-Dimensional.html
Dim dX As Double
Dim dY As Double

dX = X2 - X1
dY = Y2 - Y1
If (dX = 0 And dY = 0) Then
' It's a point not a line segment.
dX = px - X1
dY = py - Y1
near_x = X1
near_y = Y1
fDistanceToLine = VBA.Sqr(dX * dX + dY * dY)
Exit Function
End If

' Calculate the t that minimizes the distance.
t = ((px - X1) * dX + (py - Y1) * dY) / (dX * dX + dY * dY)

' See if this represents one of the segment's end points or a point in the middle.
If t 1 Then
dX = px - X2
dY = py - Y2
near_x = X2
near_y = Y2
Else
near_x = X1 + t * dX
near_y = Y1 + t * dY
dX = px - near_x
dY = py - near_y
End If

fDistanceToLine = VBA.Sqr(dX * dX + dY * dY)
End Function

Public Function fDistance2DPoints(ByRef oPointA As tXYZ, ByRef oPointB As tXYZ) As Double
Dim dX As Double
Dim dY As Double

dX = oPointA.X - oPointB.X
dY = oPointA.Y - oPointB.Y

fDistance2DPoints = VBA.Sqr((dX * dX) + (dY * dY))
End Function

Public Sub sDouglasPeucker()
Dim lgItem As Long
Dim lgR As Long
Dim lgC As Long
Dim aData As Variant
Dim oPoint() As tXYZ
Dim oPtFilter() As tXYZ
Dim Threshold As Double

aData = Selection.Value2
ReDim oPoint(LBound(aData, 1) To UBound(aData, 1))
For lgItem = LBound(aData, 1) To UBound(aData, 1)
oPoint(lgItem).X = aData(lgItem, 1)
oPoint(lgItem).Y = aData(lgItem, 2)
Next lgItem

Threshold = VBA.Val(VBA.InputBox("Threshold value", , 1))
oPtFilter() = fDouglasPeucker(oPoint(), Threshold)

'Show in Worksheet:
With ActiveSheet
lgR = Selection.Row - 1
lgC = Selection.Column
For lgItem = LBound(oPtFilter) To UBound(oPtFilter)
lgR = lgR + 1
.Cells(lgR, lgC + 2).Value2 = oPtFilter(lgItem).X
.Cells(lgR, lgC + 3).Value2 = oPtFilter(lgItem).Y
Next lgItem
End With

Erase oPoint()
Erase aData
End Sub

Public Function fDouglasPeucker(ByRef oPoint() As tXYZ, _
Optional ByVal Threshold As Double = 0) As tXYZ()
' https://en.wikipedia.org/wiki/Ramer%E2%80%93Douglas%E2%80%93Peucker_algorithm
' Ramer–Douglas–Peucker algorithm (RDP)
' ToDo: Solve "crossing" segments (one/several vertex get trapped)...
' ToDo: if no more than one vertex get inside a segment, then if that vertex below threshold, can be deleted

Dim oLine() As tXYZ
Dim lgPoint As Long
Dim lgSegment As Long
Dim lgItem As Long
Dim lgMove As Long
Dim dbDistance As Double
Dim dbDistanceMax As Double
Dim dbCheck As Double
Dim lgBreaker As Long
Dim bClosest As Boolean
Dim dbVectorMultiplier As Double
Dim bBreaker As Boolean
Dim bIterate As Boolean

If Not (Not oPoint()) Then
If Threshold = 0 Then
fDouglasPeucker = oPoint()
Exit Function
End If

' Get K.P. for each point
oPoint(LBound(oPoint)).Z = 0
For lgPoint = (LBound(oPoint) + 1) To UBound(oPoint)
oPoint(lgPoint).Z = oPoint(lgPoint - 1).Z + fDistance2DPoints(oPoint(lgPoint), oPoint(lgPoint - 1))
Next lgPoint

lgSegment = g_BASE + 1
ReDim Preserve oLine(g_BASE + 0 To lgSegment)
oLine(g_BASE + 0) = oPoint(LBound(oPoint))
oLine(g_BASE + 1) = oPoint(UBound(oPoint))

Do
bIterate = False

lgPoint = LBound(oPoint) ' initialze with first item
For lgSegment = LBound(oLine) To UBound(oLine) - 1
dbDistanceMax = Threshold
bBreaker = False

For lgPoint = (LBound(oPoint) + 1) To (UBound(oPoint) - 1) 'Avoid extremes, as they are already in the final set
dbDistance = fDistanceToLine(oPoint(lgPoint).X, oPoint(lgPoint).Y, _
oLine(lgSegment).X, oLine(lgSegment).Y, _
oLine(lgSegment + 1).X, oLine(lgSegment + 1).Y, _
dbVectorMultiplier)
' First condition to apply only if is inside range: oLine(lgSegment).KP <= oNearestOnLine.KP = 0 And dbVectorMultiplier <= 1) Then
' Second condition is to apply only if point is the closest to any other segment...
bClosest = True ' Initialize
For lgItem = LBound(oLine) To (UBound(oLine) - 1)
If lgItem lgSegment Then ' avoid itself...
dbCheck = fDistanceToLine(oPoint(lgPoint).X, oPoint(lgPoint).Y, _
oLine(lgItem).X, oLine(lgItem).Y, _
oLine(lgItem + 1).X, oLine(lgItem + 1).Y, _
dbVectorMultiplier)
If (dbVectorMultiplier >= 0 And dbVectorMultiplier <= 1) Then ' Only if inside range...
If dbCheck < dbDistance Then
bClosest = False
Exit For
End If
End If
End If
Next lgItem

If bClosest Then
If dbDistanceMax UBound(oPoint) Then Exit For ' only if points sorted...
Next lgSegment
Loop While bIterate

fDouglasPeucker = oLine()
End If
End Function

Public Function fDouglasPeuckerRange(ByRef rgRangeData As Excel.Range, _
Optional ByVal Threshold As Double = 0) As Variant
Dim lgItem As Long
Dim aData As Variant
Dim oPoint() As tXYZ
Dim oPtFilter() As tXYZ

aData = Selection.Value2
ReDim Preserve oPoint(LBound(aData, 1) To UBound(aData, 1))
For lgItem = LBound(aData, 1) To UBound(aData, 1)
oPoint(lgItem).X = aData(lgItem, 1)
oPoint(lgItem).Y = aData(lgItem, 2)
Next lgItem

Threshold = 1
oPtFilter() = fDouglasPeucker(oPoint(), Threshold)
ReDim aData(LBound(oPtFilter) To UBound(oPtFilter), 1 To 2)
For lgItem = LBound(oPtFilter) To UBound(oPtFilter)
aData(lgItem, 1) = oPtFilter(lgItem).X
aData(lgItem, 2) = oPtFilter(lgItem).Y
Next lgItem
fDouglasPeuckerRange = aData

Erase oPoint()
Erase aData
End Function
[/sourcecode]

Leave a Reply

Your email address will not be published. Required fields are marked *