Could not have done it without the Better Programming article.
function minHash($a, $b, $shingleLength=3, [switch]$caseInsensitive=$False) {
    <#
    .SYNOPSIS
    Scores how similar two strings are by comparing their character shingles.

    .DESCRIPTION
    Each input is trimmed and split into the set of all overlapping substrings
    ("shingles") of $shingleLength characters. The result is the Jaccard
    similarity of the two sets: |A intersect B| / |A union B|.
    Despite the function's name, no MinHash signature is computed; the exact
    set ratio is returned.

    .PARAMETER a
    First string to compare. $null is treated as an empty string.

    .PARAMETER b
    Second string to compare. $null is treated as an empty string.

    .PARAMETER shingleLength
    Number of characters per shingle (default 3). Must be at least 1.

    .PARAMETER caseInsensitive
    When set, both inputs are upper-cased (invariant culture) before shingling.

    .OUTPUTS
    A number between 0 and 1, or $null when neither input is long enough to
    produce a single shingle (the ratio would be 0/0).
    #>
    # adapted from https://betterprogramming.pub/identify-similarities-between-sentences-in-python-e9f71d454d1d

    $len = [int]$shingleLength
    if ($len -lt 1) {
        throw (New-Object System.ArgumentOutOfRangeException -ArgumentList 'shingleLength', 'Shingle length must be at least 1.')
    }

    # Builds the set of distinct $size-character substrings of $text.
    function getShingles($text, $size, $ignoreCase) {
        # courtesy https://stackoverflow.com/a/29127088
        $set = New-Object 'System.Collections.Generic.HashSet[string]'
        if ($null -ne $text) {
            $clean = ([string]$text).Trim()
            if ($ignoreCase) {
                # Invariant casing keeps results independent of the session culture.
                $clean = $clean.ToUpperInvariant()
            }
            for ($i = 0; $i -le ($clean.Length - $size); $i++) {
                [void]$set.Add($clean.Substring($i, $size))
            }
        }
        # The unary comma stops PowerShell from unrolling the set on output;
        # without it an empty set would reach the caller as $null.
        return ,$set
    }

    # Returns |left intersect right| / |left union right|, or $null if both sets are empty.
    function jaccardSimilarity($left, $right) {
        $shared = 0
        foreach ($item in $left) {
            if ($right.Contains($item)) {
                $shared++
            }
        }
        # Inclusion-exclusion: shared members would otherwise be counted twice.
        $unionCount = $left.Count + $right.Count - $shared
        if ($unionCount -eq 0) {
            return $null
        }
        return $shared / $unionCount
    }

    $ignoreCase = [bool]$caseInsensitive
    $shinglesA = getShingles $a $len $ignoreCase
    $shinglesB = getShingles $b $len $ignoreCase
    return jaccardSimilarity $shinglesA $shinglesB
}