DataMerger fails when merging indices with different types
Created by: gkirgizov
Failing example: the traceback below shows that a `str` index can't be merged with an `int` index.
Possibly this would also fail for two non-numeric indices.
2022-07-28T15:17:14.1922292Z test/unit/utilities/test_project_import_export.py:88:
2022-07-28T15:17:14.1922714Z _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _
2022-07-28T15:17:14.1923082Z fedot/api/main.py:164: in fit
2022-07-28T15:17:14.1923533Z self._train_pipeline_on_full_dataset(recommendations, full_train_not_preprocessed)
2022-07-28T15:17:14.1924029Z fedot/api/main.py:422: in _train_pipeline_on_full_dataset
2022-07-28T15:17:14.1924537Z n_jobs=self.params.api_params['n_jobs'],
2022-07-28T15:17:14.1924958Z fedot/core/pipelines/pipeline.py:161: in fit
2022-07-28T15:17:14.1925366Z use_fitted_operations=use_fitted)
2022-07-28T15:17:14.1925818Z fedot/core/pipelines/pipeline.py:116: in _fit
2022-07-28T15:17:14.1926265Z train_predicted = self.root_node.fit(input_data=input_data)
2022-07-28T15:17:14.1926664Z fedot/core/pipelines/node.py:335: in fit
2022-07-28T15:17:14.1927276Z secondary_input = self._input_from_parents(input_data=input_data, parent_operation='fit')
2022-07-28T15:17:14.1927789Z fedot/core/pipelines/node.py:363: in _input_from_parents
2022-07-28T15:17:14.1928185Z parent_operation)
2022-07-28T15:17:14.1928575Z fedot/core/pipelines/node.py:402: in _combine_parents
2022-07-28T15:17:14.1929017Z prediction = parent.fit(input_data=input_data)
2022-07-28T15:17:14.1929430Z fedot/core/pipelines/node.py:335: in fit
2022-07-28T15:17:14.1930029Z secondary_input = self._input_from_parents(input_data=input_data, parent_operation='fit')
2022-07-28T15:17:14.1930525Z fedot/core/pipelines/node.py:363: in _input_from_parents
2022-07-28T15:17:14.1930915Z parent_operation)
2022-07-28T15:17:14.1931297Z fedot/core/pipelines/node.py:402: in _combine_parents
2022-07-28T15:17:14.1931738Z prediction = parent.fit(input_data=input_data)
2022-07-28T15:17:14.1932151Z fedot/core/pipelines/node.py:335: in fit
2022-07-28T15:17:14.1932749Z secondary_input = self._input_from_parents(input_data=input_data, parent_operation='fit')
2022-07-28T15:17:14.1933247Z fedot/core/pipelines/node.py:365: in _input_from_parents
2022-07-28T15:17:14.1933699Z secondary_input = DataMerger.get(parent_results).merge()
2022-07-28T15:17:14.1934152Z fedot/core/data/merge/data_merger.py:56: in get
2022-07-28T15:17:14.1934528Z return cls(outputs, data_type)
2022-07-28T15:17:14.1934935Z fedot/core/data/merge/data_merger.py:30: in __init__
2022-07-28T15:17:14.1935369Z self.common_indices = find_common_elements(*idx_list)
2022-07-28T15:17:14.1935822Z fedot/core/data/array_utilities.py:10: in find_common_elements
2022-07-28T15:17:14.1936405Z common_elements = reduce(np.intersect1d, indices[1:], indices[0])
2022-07-28T15:17:14.1936873Z <__array_function__ internals>:6: in intersect1d
2022-07-28T15:17:14.1937204Z ???
2022-07-28T15:17:14.1937537Z _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _
2022-07-28T15:17:14.1937801Z
2022-07-28T15:17:14.1938176Z ar1 = array(['Alpen gold', 'Alpen gold1', 'Rossia shedraya dusha',
2022-07-28T15:17:14.1938848Z 'Rossia shedraya dusha1', 'Shipuchka', 'Shipuchka1...acks1', 'Werthers Original Caramel',
2022-07-28T15:17:14.1939461Z 'Werthers Original Caramel1', 'Whoppers', 'Whoppers1'],
2022-07-28T15:17:14.1939851Z dtype=object)
2022-07-28T15:17:14.1940285Z ar2 = array([ 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16,
2022-07-28T15:17:14.1940616Z 17, 18, 19])
2022-07-28T15:17:14.1940982Z assume_unique = False, return_indices = False
2022-07-28T15:17:14.1941265Z
2022-07-28T15:17:14.1941512Z @array_function_dispatch(_intersect1d_dispatcher)
2022-07-28T15:17:14.1941956Z def intersect1d(ar1, ar2, assume_unique=False, return_indices=False):
2022-07-28T15:17:14.1942422Z """
2022-07-28T15:17:14.1942780Z Find the intersection of two arrays.
2022-07-28T15:17:14.1943124Z
2022-07-28T15:17:14.1943516Z Return the sorted, unique values that are in both of the input arrays.
2022-07-28T15:17:14.1943892Z
2022-07-28T15:17:14.1944191Z Parameters
2022-07-28T15:17:14.1944546Z ----------
2022-07-28T15:17:14.1944881Z ar1, ar2 : array_like
2022-07-28T15:17:14.1945280Z Input arrays. Will be flattened if not already 1D.
2022-07-28T15:17:14.1945678Z assume_unique : bool
2022-07-28T15:17:14.1946097Z If True, the input arrays are both assumed to be unique, which
2022-07-28T15:17:14.1946562Z can speed up the calculation. Default is False.
2022-07-28T15:17:14.1946936Z return_indices : bool
2022-07-28T15:17:14.1947352Z If True, the indices which correspond to the intersection of the two
2022-07-28T15:17:14.1947857Z arrays are returned. The first instance of a value is used if there are
2022-07-28T15:17:14.1948299Z multiple. Default is False.
2022-07-28T15:17:14.1948636Z
2022-07-28T15:17:14.1949000Z .. versionadded:: 1.15.0
2022-07-28T15:17:14.1949310Z
2022-07-28T15:17:14.1949600Z Returns
2022-07-28T15:17:14.1949957Z -------
2022-07-28T15:17:14.1950290Z intersect1d : ndarray
2022-07-28T15:17:14.1950844Z Sorted 1D array of common and unique elements.
2022-07-28T15:17:14.1951220Z comm1 : ndarray
2022-07-28T15:17:14.1951622Z The indices of the first occurrences of the common values in `ar1`.
2022-07-28T15:17:14.1952092Z Only provided if `return_indices` is True.
2022-07-28T15:17:14.1952465Z comm2 : ndarray
2022-07-28T15:17:14.1952863Z The indices of the first occurrences of the common values in `ar2`.
2022-07-28T15:17:14.1953327Z Only provided if `return_indices` is True.
2022-07-28T15:17:14.1953749Z
2022-07-28T15:17:14.1954015Z
2022-07-28T15:17:14.1954298Z See Also
2022-07-28T15:17:14.1954665Z --------
2022-07-28T15:17:14.1955087Z numpy.lib.arraysetops : Module with a number of other functions for
2022-07-28T15:17:14.1955566Z performing set operations on arrays.
2022-07-28T15:17:14.1955907Z
2022-07-28T15:17:14.1956201Z Examples
2022-07-28T15:17:14.1956552Z --------
2022-07-28T15:17:14.1956909Z >>> np.intersect1d([1, 3, 4, 3], [3, 1, 2, 1])
2022-07-28T15:17:14.1957404Z array([1, 3])
2022-07-28T15:17:14.1958062Z
2022-07-28T15:17:14.1958418Z To intersect more than two arrays, use functools.reduce:
2022-07-28T15:17:14.1958687Z
2022-07-28T15:17:14.1958903Z >>> from functools import reduce
2022-07-28T15:17:14.1959204Z >>> reduce(np.intersect1d, ([1, 3, 4, 3], [3, 1, 2, 1], [6, 3, 4, 2]))
2022-07-28T15:17:14.1959650Z array([3])
2022-07-28T15:17:14.1959849Z
2022-07-28T15:17:14.1960105Z To return the indices of the values common to the input arrays
2022-07-28T15:17:14.1960415Z along with the intersected values:
2022-07-28T15:17:14.1960641Z
2022-07-28T15:17:14.1960836Z >>> x = np.array([1, 1, 2, 3, 4])
2022-07-28T15:17:14.1961075Z >>> y = np.array([2, 1, 4, 6])
2022-07-28T15:17:14.1961369Z >>> xy, x_ind, y_ind = np.intersect1d(x, y, return_indices=True)
2022-07-28T15:17:14.1961628Z >>> x_ind, y_ind
2022-07-28T15:17:14.1961860Z (array([0, 2, 4]), array([1, 0, 2]))
2022-07-28T15:17:14.1962093Z >>> xy, x[x_ind], y[y_ind]
2022-07-28T15:17:14.1962332Z (array([1, 2, 4]), array([1, 2, 4]), array([1, 2, 4]))
2022-07-28T15:17:14.1962549Z
2022-07-28T15:17:14.1962724Z """
2022-07-28T15:17:14.1962921Z ar1 = np.asanyarray(ar1)
2022-07-28T15:17:14.1963156Z ar2 = np.asanyarray(ar2)
2022-07-28T15:17:14.1963358Z
2022-07-28T15:17:14.1963553Z if not assume_unique:
2022-07-28T15:17:14.1963854Z if return_indices:
2022-07-28T15:17:14.1964115Z ar1, ind1 = unique(ar1, return_index=True)
2022-07-28T15:17:14.1964403Z ar2, ind2 = unique(ar2, return_index=True)
2022-07-28T15:17:14.1964618Z else:
2022-07-28T15:17:14.1964825Z ar1 = unique(ar1)
2022-07-28T15:17:14.1965042Z ar2 = unique(ar2)
2022-07-28T15:17:14.1965233Z else:
2022-07-28T15:17:14.1965433Z ar1 = ar1.ravel()
2022-07-28T15:17:14.1965646Z ar2 = ar2.ravel()
2022-07-28T15:17:14.1965829Z
2022-07-28T15:17:14.1966048Z aux = np.concatenate((ar1, ar2))
2022-07-28T15:17:14.1966290Z if return_indices:
2022-07-28T15:17:14.1966764Z aux_sort_indices = np.argsort(aux, kind='mergesort')
2022-07-28T15:17:14.1967042Z aux = aux[aux_sort_indices]
2022-07-28T15:17:14.1967256Z else:
2022-07-28T15:17:14.1967442Z > aux.sort()
2022-07-28T15:17:14.1967829Z E TypeError: '<' not supported between instances of 'int' and 'str'
2022-07-28T15:17:14.1968036Z