Source code for pygeo.geo_utils.remove_duplicates

# External modules
import numpy as np

# Local modules
from .norm import eDist

# --------------------------------------------------------------
#  Functions that removes duplicate entries from a list
# --------------------------------------------------------------

[docs] def unique(s): r"""Return a list of the elements in s, but without duplicates. For example, ``unique([1,2,3,1,2,3])`` is some permutation of ``[1,2,3]``, ``unique("abcabc")`` some permutation of ``["a", "b", "c"]``, and ``unique(([1, 2], [2, 3], [1, 2]))`` some permutation of ``[[2, 3], [1, 2]]``. For best speed, all sequence elements should be hashable. Then ``unique()`` will usually work in linear time. If not possible, the sequence elements should enjoy a total ordering, and if ``list(s).sort()`` doesn't raise ``TypeError`` it's assumed that they do enjoy a total ordering. Then ``unique()`` will usually work in :math:`\mathcal{O}(N\log_2(N))` time. If that's not possible either, the sequence elements must support equality-testing. Then ``unique()`` will usually work in quadratic time. """ n = len(s) if n == 0: return [] # Try using a dict first, as that's the fastest and will usually # work. If it doesn't work, it will usually fail quickly, so it # usually doesn't np.cost much to *try* it. It requires that all the # sequence elements be hashable, and support equality comparison. u = {} try: for x in s: u[x] = 1 except TypeError: pass else: return sorted(u.keys()) # We can't hash all the elements. Second fastest is to sort, # which brings the equal elements together; then duplicates are # easy to weed out in a single pass. # NOTE: Python's list.sort() was designed to be efficient in the # presence of many duplicate elements. This isn't true of all # sort functions in all languages or libraries, so this approach # is more effective in Python than it may be elsewhere. try: t = list(s) t.sort() except TypeError: pass else: assert n > 0 last = t[0] lasti = i = 1 while i < n: if t[i] != last: t[lasti] = last = t[i] lasti += 1 i += 1 return t[:lasti] # Brute force is all that's left. u = [] for x in s: if x not in u: u.append(x) return u
[docs] def uniqueIndex(s, sHash=None): """ This function is based on :meth:`unique`. The idea is to take a list s, and reduce it as per unique. However, it additionally calculates a linking index array that is the same size as the original s, and points to where it ends up in the the reduced list if sHash is not specified for sorting, s is used """ if sHash is not None: ind = np.argsort(np.argsort(sHash)) else: ind = np.argsort(np.argsort(s)) n = len(s) t = list(s) t.sort() diff = np.zeros(n, "bool") last = t[0] lasti = i = 1 while i < n: if t[i] != last: t[lasti] = last = t[i] lasti += 1 else: diff[i] = True i += 1 b = np.where(diff)[0] for i in range(n): ind[i] -= b.searchsorted(ind[i], side="right") return t[:lasti], ind
[docs] def pointReduce(points, nodeTol=1e-4): """Given a list of N points in ndim space, with possible duplicates, return a list of the unique points AND a pointer list for the original points to the reduced set""" # First points = np.array(points) N = len(points) if N == 0: return points, None dists = [] for ipt in range(N): dists.append(np.sqrt([ipt], points[ipt]))) # we need to round the distances to 8 decimals before sorting # because 2 points might have "identical" distances to the origin, # but they might differ on the 16 significant figure. As a result # the argsort might flip their order even though the elements # should not take over each other. By rounding them to 8 # significant figures, we somewhat guarantee that nodes that # have similar distances to the origin dont get shuffled # because of floating point errors dists_rounded = np.around(dists, decimals=8) # the "stable" sorting algorithm guarantees that entries # with the same values dont overtake each other. # The entries with identical distances are fully checked # in the brute force check below. ind = np.argsort(dists_rounded, kind="stable") i = 0 cont = True newPoints = [] link = np.zeros(N, "intc") linkCounter = 0 while cont: cont2 = True tempInd = [] j = i while cont2: if abs(dists[ind[i]] - dists[ind[j]]) < nodeTol: tempInd.append(ind[j]) j = j + 1 if j == N: # Overrun check cont2 = False else: cont2 = False subPoints = [] # Copy of the list of sub points with the dists for ii in range(len(tempInd)): subPoints.append(points[tempInd[ii]]) # Brute Force Search them subUniquePts, subLink = pointReduceBruteForce(subPoints, nodeTol) newPoints.extend(subUniquePts) for ii in range(len(tempInd)): link[tempInd[ii]] = subLink[ii] + linkCounter linkCounter += max(subLink) + 1 i = j - 1 + 1 if i == N: cont = False return np.array(newPoints), np.array(link)
[docs] def pointReduceBruteForce(points, nodeTol=1e-4): """Given a list of N points in ndim space, with possible duplicates, return a list of the unique points AND a pointer list for the original points to the reduced set Warnings -------- This is the brute force version of :func:`pointReduce`. """ N = len(points) if N == 0: return points, None uniquePoints = [points[0]] link = [0] for i in range(1, N): foundIt = False for j in range(len(uniquePoints)): if eDist(points[i], uniquePoints[j]) < nodeTol: link.append(j) foundIt = True break if not foundIt: uniquePoints.append(points[i]) link.append(j + 1) return np.array(uniquePoints), np.array(link)