Commit 1d2295eb authored by Lukas Eller's avatar Lukas Eller

finished link dataframes

parent d3076dd4
from .preprocess import synchronize_df
from .preprocess import link_dataframes
import pandas as pd
import itertools
from pandas.api.types import is_numeric_dtype
def synchronize_df(majority_df, minority_df, column, fk_entry, merge_columns):
    """
    Synchronize two DataFrames along a numeric column.

    For every row of ``majority_df`` the closest row of ``minority_df``
    (minimum absolute difference in ``column``) is looked up; its index is
    stored in a new foreign-key column, and the requested minority columns
    are copied over.

    Example:
        Synchronize measurements and GPS - append merge columns to the
        final dataframe::

            synchronize_df(meas, gps, 'Datetime', 'fk_gps',
                           ['Lon.', 'Lat.', 'Quality'])

    :param majority_df: DataFrame that is extended (mutated) in place
    :param minority_df: DataFrame the closest match is taken from
    :param column: numeric column present in both frames, used for matching
    :param fk_entry: name of the new column holding the matched minority index
    :param merge_columns: minority columns copied into the majority frame
    :return: the mutated majority_df
    """
    # Pre-create the target columns so the .loc assignments below cannot
    # introduce new columns row by row.
    majority_df[fk_entry] = 0
    for col in merge_columns:
        majority_df[col] = 0

    c = 0
    for i, meas in majority_df.iterrows():
        c += 1
        # Lightweight progress indicator, overwritten in place via '\r'
        print(f"{c} / {len(majority_df)}", end='\r')
        # Positional index of the minority row at minimum absolute distance
        sol = (meas[column] - minority_df[column]).abs().argmin()
        majority_df.loc[i, fk_entry] = minority_df.iloc[sol].name
        for col in merge_columns:
            majority_df.loc[i, col] = minority_df.iloc[sol][col]

    return majority_df


def link_dataframes(A: pd.DataFrame, B: pd.DataFrame, ref_col: str, unique_col=None, metric=None) -> pd.DataFrame:
    '''
    Merge two DataFrames A and B according to the reference column based on
    minimum metric.

    :param A: DataFrame whose rows (and index) are kept
    :param B: DataFrame a best match is picked from for every row of A
    :param ref_col: Reference column to merge dataframes. Has to exist in both frames
    :param unique_col: Extra group-by column. Needed e.g. for scanner
        measurements, where indices and timestamps in A are not unique.
    :param metric: Metric used to determine matches in ref_col.
        Default ``lambda a, b: (a - b).abs()``
    :return: Single merged dataframe with a column MultiIndex separating the
        columns of A (level 'A') and B (level 'B')
    :raises ValueError: if ref_col of A contains duplicates and no unique_col
        is given, or if ref_col does not support subtraction
    '''
    # Check if the reference column is unique
    if not len(A[ref_col].unique()) == len(A[ref_col]) and not unique_col:
        raise ValueError("Duplicates in ref_col of dataframe A - set unique col")

    # EAFP numeric check: the metric below needs element-wise subtraction
    try:
        A[ref_col].iloc[0] - B[ref_col].iloc[0]
    except Exception:
        raise ValueError("Reference column has to be numeric")

    # Generate help DataFrame via a cross join (constant merge key)
    merged_AB = pd.merge(
        A.assign(merge_key=1),
        B.assign(merge_key=1),
        on='merge_key',
        suffixes=('_A', '_B')
    )

    # Use absolute distance as default metric
    if not metric:
        metric = lambda a, b: (a - b).abs()

    # unique_col is needed because in measurement dataframes from the
    # scanner, indices and timestamps are not unique
    groupby_keys = [f"{ref_col}_A"]
    if unique_col:
        groupby_keys.append(unique_col)

    # Keep the minimum-distance entry for each group
    merged_AB = merged_AB.groupby(by=groupby_keys, sort=False).apply(
        lambda grouped: grouped.loc[
            metric(
                grouped[f'{ref_col}_A'], grouped[f'{ref_col}_B']
            ).idxmin()
        ]
    )

    # Remove help columns and set index of merged dataframe to index of A
    merged_AB.drop(columns=['merge_key'], inplace=True)
    merged_AB.reset_index(drop=True, inplace=True)
    merged_AB.set_index(A.index, inplace=True)

    # Generate a MultiIndex to separate columns from dataframes A and B
    merged_AB.columns = pd.MultiIndex.from_tuples(
        itertools.chain(
            zip(len(A.columns) * ['A'], A.columns),
            zip(len(B.columns) * ['B'], B.columns)
        )
    )

    return merged_AB
# NOTE(review): the fragment below appears to be scratch / demo code left
# over from a diff rather than an importable module: `meas`, `A`, `B` and
# `gps` are never defined here, and `link_dataframes` is not imported, so
# executing it raises NameError. Kept byte-identical; confirm whether it
# should be deleted or moved into a test.
import pandas as pd
import warnings
import os
import pickle
#Assume that A is the longer df
meas  # NOTE(review): bare expression; `meas` is undefined in this fragment
combined = link_dataframes(A, B, 'ref')  # presumably the dummy example - A/B undefined here
all(combined['A'] == A)  # result unused; likely an interactive sanity check
merged = link_dataframes(meas, gps, 'Datetime', unique_col="PCI")  # real-data example
This diff is collapsed.
,Datetime,Lon.,Lat.,Height,Distance,Quality,Satellites,Velocity,PDOP,HDOP,VDOP,Status
28,2020-12-14 23:10:04.249,16.370825,48.19643,213,0,1,8,0,,,,
29,2020-12-14 23:10:05.249,16.370825,48.19643,213,0,1,8,0,,,,
30,2020-12-14 23:10:06.253,16.370825,48.19643,213,0,1,8,0,,,,
31,2020-12-14 23:10:07.272,16.370825,48.19643,213,0,1,8,0,,,,
32,2020-12-14 23:10:08.249,16.370825,48.19643,212,0,1,8,0,,,,
33,2020-12-14 23:10:09.250,16.370825,48.19643,212,0,1,8,0,,,,
34,2020-12-14 23:10:10.249,16.370827,48.196434,212,0,1,8,0,,,,
35,2020-12-14 23:10:11.249,16.370827,48.196434,212,0,1,8,0,,,,
36,2020-12-14 23:10:12.251,16.370829,48.196434,212,0,1,8,0,,,,
37,2020-12-14 23:10:13.273,16.370829,48.196434,211,0,1,8,0,,,,
,Datetime,PCI,CP,Cell TX antennas,RSRP,RSRQ,CINR,Time offset,Delay spread,Indication,MIMO mode,RSSI,CFO
28,2020-12-14 23:10:04.335,366,1,1,-86.4,-12.9,7.4,13188,14,0,,-56.5,
28,2020-12-14 23:10:04.335,368,1,1,-91.4,-17.9,-6.8,13192,10,0,,-56.5,
30,2020-12-14 23:10:04.351,339,1,1,-85.2,-12.2,22.1,287554,10,0,,-54.3,
32,2020-12-14 23:10:04.372,366,1,1,-86.5,-12.6,16.9,13188,14,0,,-56.9,
32,2020-12-14 23:10:04.372,368,1,1,-91.6,-17.8,-6.7,13192,12,0,,-56.9,
34,2020-12-14 23:10:04.387,339,1,1,-85.2,-12.3,21.3,287554,10,0,,-54.1,
36,2020-12-14 23:10:04.402,339,1,1,-85.3,-12.2,22.0,287554,10,0,,-54.3,
38,2020-12-14 23:10:04.423,368,1,1,-91.9,-18.1,-6.9,13192,10,0,,-56.9,
38,2020-12-14 23:10:04.423,366,1,1,-86.5,-12.6,13.7,13188,14,0,,-56.9,
40,2020-12-14 23:10:04.439,339,1,1,-85.2,-12.2,21.6,287554,10,0,,-54.2,
42,2020-12-14 23:10:04.456,366,1,1,-86.5,-12.6,14.8,13188,14,0,,-56.9,
42,2020-12-14 23:10:04.456,368,1,1,-91.6,-17.7,-6.2,13192,16,0,,-56.9,
44,2020-12-14 23:10:04.470,339,1,1,-85.2,-12.2,21.9,287554,10,0,,-54.3,
46,2020-12-14 23:10:04.489,368,1,1,-91.9,-18.0,-6.4,13192,10,0,,-56.9,
46,2020-12-14 23:10:04.489,366,1,1,-86.5,-12.6,12.6,13188,14,0,,-56.9,
48,2020-12-14 23:10:04.551,366,1,1,-86.5,-13.5,3.9,13188,14,0,,-56.0,
48,2020-12-14 23:10:04.551,368,1,1,-91.7,-18.7,-7.1,13192,10,0,,-56.0,
50,2020-12-14 23:10:04.600,339,1,1,-85.3,-12.2,21.7,287554,10,0,,-54.3,
52,2020-12-14 23:10:04.662,246,1,1,-105.2,-28.6,-16.6,252207,1,0,,-56.6,
52,2020-12-14 23:10:04.662,247,1,1,-86.1,-9.5,16.1,252207,6,0,,-56.6,
54,2020-12-14 23:10:04.718,242,1,1,-101.3,-32.3,-20.8,287552,0,0,,-50.3,
54,2020-12-14 23:10:04.718,339,1,1,-81.4,-12.4,20.4,287549,10,0,,-50.3,
56,2020-12-14 23:10:04.878,24,1,1,-113.3,-19.2,-9.5,13250,10,0,,-74.1,
56,2020-12-14 23:10:04.878,363,1,1,-114.3,-20.2,-10.6,13223,6,0,,-74.1,
56,2020-12-14 23:10:04.878,194,1,1,-112.4,-18.3,-5.6,13196,33,0,,-74.1,
56,2020-12-14 23:10:04.878,394,1,1,-115.2,-21.4,-9.0,227924,21,0,,-73.8,
56,2020-12-14 23:10:04.878,26,1,1,-112.9,-18.8,-6.3,13250,7,0,,-74.1,
56,2020-12-14 23:10:04.878,355,1,1,-114.8,-20.7,-8.5,13212,27,0,,-74.1,
56,2020-12-14 23:10:04.878,285,1,1,-114.8,-20.7,-11.1,13251,6,0,,-74.1,
56,2020-12-14 23:10:04.878,393,1,1,-105.8,-12.0,0.3,227926,24,0,,-73.8,
58,2020-12-14 23:10:05.018,393,1,1,-108.4,-13.4,-2.2,227924,20,0,,-76.3,
58,2020-12-14 23:10:05.018,194,1,1,-118.7,-23.5,-11.1,13189,31,0,,-76.4,
58,2020-12-14 23:10:05.018,394,1,1,-113.9,-18.8,-6.2,227925,22,0,,-76.3,
58,2020-12-14 23:10:05.018,369,1,1,-118.4,-23.1,-13.7,13276,8,0,,-76.5,
58,2020-12-14 23:10:05.018,24,1,1,-110.1,-14.9,-4.3,13251,9,0,,-76.5,
58,2020-12-14 23:10:05.018,26,1,1,-116.3,-21.1,-8.7,13250,7,0,,-76.5,
58,2020-12-14 23:10:05.018,285,1,1,-117.9,-22.7,-13.2,13251,1,0,,-76.5,
58,2020-12-14 23:10:05.018,355,1,1,-116.0,-20.8,-8.3,13233,28,0,,-76.5,
60,2020-12-14 23:10:05.075,366,1,1,-74.0,-13.4,6.2,13194,10,0,,-40.6,
60,2020-12-14 23:10:05.075,368,1,1,-82.9,-22.3,-11.6,13191,10,0,,-40.6,
import unittest
import pandas as pd
from context import measprocess as mpc
import os
import pickle
def extract_scan(path):
    """Load the pickled OFDM scan stored under *path* (file ``ofdmscan.p0``).

    Returns a 3-tuple: the first record, the band entry and the channel
    entry of the unpickled structure.
    NOTE(review): the channel lookup omits the ``[1]`` level used by the
    first two lookups - confirm this asymmetry is intended.
    """
    scan_path = os.path.join(path, 'ofdmscan.p0')
    with open(scan_path, "rb") as scan_file:
        payload = pickle.load(scan_file)
    entry = payload[0][7]
    return entry[1]['rec'][0], entry[1]['ot']['Band'], entry['ot']['Ch']
def extract_gps(path):
    """Load the pickled GPS trace stored under *path* (file ``gps.p0``).

    Returns the ``'ot'`` entry of the first unpickled element.
    """
    gps_path = os.path.join(path, 'gps.p0')
    with open(gps_path, "rb") as gps_file:
        payload = pickle.load(gps_file)
    return payload[0]['ot']
class TestSum(unittest.TestCase):
    """Legacy end-to-end check of synchronize_df on pickled scanner data."""

    def test_synchronize(self):
        path = [
            "tests/example_files/",
            'scanner'
        ]
        meas, band, channel = extract_scan(
            os.path.join(*path)
        )
        meas['band'] = band
        meas['channel'] = channel
        gps = extract_gps(
            os.path.join(*path)
        )
        merged = mpc.preprocess.synchronize_df(meas, gps, 'Datetime', 'fk_gps', ['Lon.', 'Lat.', 'Quality'])
        print(merged)


class TestLinkFrames(unittest.TestCase):
    """Tests for mpc.preprocess.link_dataframes on dummy and real data."""

    def setUp(self):
        # Init the dummy example
        A = pd.DataFrame({'ref': [10, 20, 30, 31, 5], 'info': [5, 4, 1, 2, 1]})
        A.index = range(2, 7)
        B = pd.DataFrame({'ref': [15, 17, 27, 4], 'info': [1, 2, 3, 4], 'yo': [4, 6, 1, 8]})
        B.index = range(9, 13)
        self._A, self._B = A, B
        # Real scanner/GPS fixtures (CSV exports)
        self._meas = pd.read_csv("tests/example_files/scanner_1/measurement_example.csv", index_col=0)
        self._gps = pd.read_csv("tests/example_files/scanner_1/gps_example.csv", index_col=0)

    def test_multiindex_dummy(self):
        combined = mpc.preprocess.link_dataframes(self._A, self._B, ref_col="ref")
        self.assertTrue(
            all(self._A == combined['A'])
        )

    def test_real_numeric_type(self):
        # 'Datetime' is still a string column here, so subtraction in the
        # metric must fail with ValueError
        with self.assertRaises(ValueError):
            mpc.preprocess.link_dataframes(
                self._meas,
                self._gps,
                "Datetime",
                unique_col="PCI"
            )

    def test_multiindex_real(self):
        meas, gps = self._meas, self._gps
        meas['Datetime'] = pd.to_datetime(meas['Datetime'])
        gps['Datetime'] = pd.to_datetime(gps['Datetime'])
        combined = mpc.preprocess.link_dataframes(
            meas,
            gps,
            "Datetime",
            unique_col="PCI"
        )
        self.assertTrue(
            all(meas == combined['A'])
        )

    def test_duplicates_dummy(self):
        # 'info' contains duplicates and no unique_col is given -> must raise
        with self.assertRaises(ValueError):
            mpc.preprocess.link_dataframes(self._A, self._B, ref_col="info")


if __name__ == '__main__':
    unittest.main()
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment