Could not have done it without the Better Programming article.
function minHash($a, $b, $shingleLength=3, [switch]$caseInsensitive=$False) {
    # adapted from https://betterprogramming.pub/identify-similarities-between-sentences-in-python-e9f71d454d1d
    #
    # Scores how similar two strings are by comparing their sets of character
    # shingles (every substring of $shingleLength consecutive characters).
    # Despite the name, this is the exact Jaccard similarity of the two shingle
    # sets, not a MinHash estimate of it.
    #
    # Parameters:
    #   $a, $b            values to compare; each is converted to a string and
    #                     trimmed of leading/trailing whitespace ($null counts
    #                     as the empty string).
    #   $shingleLength    number of characters per shingle; must be at least 1.
    #   -caseInsensitive  upper-case both inputs before shingling.
    #
    # Returns:
    #   |A intersect B| / |A union B|, between 0 (no shared shingle) and
    #   1 (identical shingle sets), or $null when neither input is long enough
    #   to yield a single shingle.

    $len = [int]$shingleLength
    if($len -lt 1) {
        throw "shingleLength must be at least 1 (got '$shingleLength')"
    }

    # Jaccard similarity of two HashSet[string] instances; $null when both are empty.
    function jaccardSimilarity($left, $right) {
        $shared = 0
        foreach($shingle in $left) {
            if($right.Contains($shingle)) { $shared++ }
        }
        # |A union B| = |A| + |B| - |A intersect B|, so no copies of the sets are needed.
        $unionCount = $left.Count + $right.Count - $shared
        if($unionCount -eq 0) { return $null }
        return $shared / $unionCount
    }

    # Builds the set of distinct substrings of length $len found in $s.
    function shingles($s, $len=3, [switch]$caseInsensitive=$False) {
        # courtesy https://stackoverflow.com/a/29127088
        $set = New-Object System.Collections.Generic.HashSet[string]
        $text = ([string]$s).Trim()
        # Invariant casing keeps the score independent of the session culture.
        if($caseInsensitive) { $text = $text.ToUpperInvariant() }
        for($i = 0; $i -le ($text.Length - $len); $i++) {
            # Substring avoids the $OFS-joined array-to-string conversion that
            # slicing a char array and adding it to a string set would trigger.
            [void]$set.Add($text.Substring($i, $len))
        }
        # The unary comma stops PowerShell from unrolling the set on output, so
        # the caller receives the HashSet itself, even when it is empty.
        return ,$set
    }

    $shinglesA = shingles $a -len $len -caseInsensitive:$caseInsensitive
    $shinglesB = shingles $b -len $len -caseInsensitive:$caseInsensitive
    return jaccardSimilarity $shinglesA $shinglesB
}