Commit 1d2295eb authored by Lukas Eller's avatar Lukas Eller

finished link dataframes

parent d3076dd4
from .preprocess import synchronize_df
from .preprocess import link_dataframes
import pandas as pd
import itertools
from pandas.api.types import is_numeric_dtype
def synchronize_df(majority_df, minority_df, column, fk_entry, merge_columns):
    """
    Synchronize two DataFrames along a numeric column.

    For every row of ``majority_df`` the closest row of ``minority_df``
    (minimum absolute difference in ``column``) is looked up; its index is
    stored in a new foreign-key column, and the requested minority columns
    are copied over.

    Example:
        Synchronize measurements and GPS - append merge columns to the
        final dataframe::

            synchronize_df(meas, gps, 'Datetime', 'fk_gps',
                           ['Lon.', 'Lat.', 'Quality'])

    :param majority_df: DataFrame that is extended (mutated) in place
    :param minority_df: DataFrame the closest match is taken from
    :param column: numeric column present in both frames, used for matching
    :param fk_entry: name of the new column holding the matched minority index
    :param merge_columns: minority columns copied into the majority frame
    :return: the mutated majority_df
    """
    # Pre-create the target columns so the .loc assignments below cannot
    # introduce new columns row by row.
    majority_df[fk_entry] = 0
    for col in merge_columns:
        majority_df[col] = 0

    c = 0
    for i, meas in majority_df.iterrows():
        c += 1
        # Lightweight progress indicator, overwritten in place via '\r'
        print(f"{c} / {len(majority_df)}", end='\r')
        # Positional index of the minority row at minimum absolute distance
        sol = (meas[column] - minority_df[column]).abs().argmin()
        majority_df.loc[i, fk_entry] = minority_df.iloc[sol].name
        for col in merge_columns:
            majority_df.loc[i, col] = minority_df.iloc[sol][col]

    return majority_df


def link_dataframes(A: pd.DataFrame, B: pd.DataFrame, ref_col: str, unique_col=None, metric=None) -> pd.DataFrame:
    '''
    Merge two DataFrames A and B according to the reference column based on
    minimum metric.

    :param A: DataFrame whose rows (and index) are kept
    :param B: DataFrame a best match is picked from for every row of A
    :param ref_col: Reference column to merge dataframes. Has to exist in both frames
    :param unique_col: Extra group-by column. Needed e.g. for scanner
        measurements, where indices and timestamps in A are not unique.
    :param metric: Metric used to determine matches in ref_col.
        Default ``lambda a, b: (a - b).abs()``
    :return: Single merged dataframe with a column MultiIndex separating the
        columns of A (level 'A') and B (level 'B')
    :raises ValueError: if ref_col of A contains duplicates and no unique_col
        is given, or if ref_col does not support subtraction
    '''
    # Check if the reference column is unique
    if not len(A[ref_col].unique()) == len(A[ref_col]) and not unique_col:
        raise ValueError("Duplicates in ref_col of dataframe A - set unique col")

    # EAFP numeric check: the metric below needs element-wise subtraction
    try:
        A[ref_col].iloc[0] - B[ref_col].iloc[0]
    except Exception:
        raise ValueError("Reference column has to be numeric")

    # Generate help DataFrame via a cross join (constant merge key)
    merged_AB = pd.merge(
        A.assign(merge_key=1),
        B.assign(merge_key=1),
        on='merge_key',
        suffixes=('_A', '_B')
    )

    # Use absolute distance as default metric
    if not metric:
        metric = lambda a, b: (a - b).abs()

    # unique_col is needed because in measurement dataframes from the
    # scanner, indices and timestamps are not unique
    groupby_keys = [f"{ref_col}_A"]
    if unique_col:
        groupby_keys.append(unique_col)

    # Keep the minimum-distance entry for each group
    merged_AB = merged_AB.groupby(by=groupby_keys, sort=False).apply(
        lambda grouped: grouped.loc[
            metric(
                grouped[f'{ref_col}_A'], grouped[f'{ref_col}_B']
            ).idxmin()
        ]
    )

    # Remove help columns and set index of merged dataframe to index of A
    merged_AB.drop(columns=['merge_key'], inplace=True)
    merged_AB.reset_index(drop=True, inplace=True)
    merged_AB.set_index(A.index, inplace=True)

    # Generate a MultiIndex to separate columns from dataframes A and B
    merged_AB.columns = pd.MultiIndex.from_tuples(
        itertools.chain(
            zip(len(A.columns) * ['A'], A.columns),
            zip(len(B.columns) * ['B'], B.columns)
        )
    )

    return merged_AB
# NOTE(review): the fragment below appears to be scratch / demo code left
# over from a diff rather than an importable module: `meas`, `A`, `B` and
# `gps` are never defined here, and `link_dataframes` is not imported, so
# executing it raises NameError. Kept byte-identical; confirm whether it
# should be deleted or moved into a test.
import pandas as pd
import warnings
import os
import pickle
#Assume that A is the longer df
meas  # NOTE(review): bare expression; `meas` is undefined in this fragment
combined = link_dataframes(A, B, 'ref')  # presumably the dummy example - A/B undefined here
all(combined['A'] == A)  # result unused; likely an interactive sanity check
merged = link_dataframes(meas, gps, 'Datetime', unique_col="PCI")  # real-data example
This diff is collapsed.
,Datetime,Lon.,Lat.,Height,Distance,Quality,Satellites,Velocity,PDOP,HDOP,VDOP,Status
28,2020-12-14 23:10:04.249,16.370825,48.19643,213,0,1,8,0,,,,
29,2020-12-14 23:10:05.249,16.370825,48.19643,213,0,1,8,0,,,,
30,2020-12-14 23:10:06.253,16.370825,48.19643,213,0,1,8,0,,,,
31,2020-12-14 23:10:07.272,16.370825,48.19643,213,0,1,8,0,,,,
32,2020-12-14 23:10:08.249,16.370825,48.19643,212,0,1,8,0,,,,
33,2020-12-14 23:10:09.250,16.370825,48.19643,212,0,1,8,0,,,,
34,2020-12-14 23:10:10.249,16.370827,48.196434,212,0,1,8,0,,,,
35,2020-12-14 23:10:11.249,16.370827,48.196434,212,0,1,8,0,,,,
36,2020-12-14 23:10:12.251,16.370829,48.196434,212,0,1,8,0,,,,
37,2020-12-14 23:10:13.273,16.370829,48.196434,211,0,1,8,0,,,,
,Datetime,PCI,CP,Cell TX antennas,RSRP,RSRQ,CINR,Time offset,Delay spread,Indication,MIMO mode,RSSI,CFO
28,2020-12-14 23:10:04.335,366,1,1,-86.4,-12.9,7.4,13188,14,0,,-56.5,
28,2020-12-14 23:10:04.335,368,1,1,-91.4,-17.9,-6.8,13192,10,0,,-56.5,
30,2020-12-14 23:10:04.351,339,1,1,-85.2,-12.2,22.1,287554,10,0,,-54.3,
32,2020-12-14 23:10:04.372,366,1,1,-86.5,-12.6,16.9,13188,14,0,,-56.9,
32,2020-12-14 23:10:04.372,368,1,1,-91.6,-17.8,-6.7,13192,12,0,,-56.9,
34,2020-12-14 23:10:04.387,339,1,1,-85.2,-12.3,21.3,287554,10,0,,-54.1,
36,2020-12-14 23:10:04.402,339,1,1,-85.3,-12.2,22.0,287554,10,0,,-54.3,
38,2020-12-14 23:10:04.423,368,1,1,-91.9,-18.1,-6.9,13192,10,0,,-56.9,
38,2020-12-14 23:10:04.423,366,1,1,-86.5,-12.6,13.7,13188,14,0,,-56.9,
40,2020-12-14 23:10:04.439,339,1,1,-85.2,-12.2,21.6,287554,10,0,,-54.2,
42,2020-12-14 23:10:04.456,366,1,1,-86.5,-12.6,14.8,13188,14,0,,-56.9,
42,2020-12-14 23:10:04.456,368,1,1,-91.6,-17.7,-6.2,13192,16,0,,-56.9,
44,2020-12-14 23:10:04.470,339,1,1,-85.2,-12.2,21.9,287554,10,0,,-54.3,
46,2020-12-14 23:10:04.489,368,1,1,-91.9,-18.0,-6.4,13192,10,0,,-56.9,
46,2020-12-14 23:10:04.489,366,1,1,-86.5,-12.6,12.6,13188,14,0,,-56.9,
48,2020-12-14 23:10:04.551,366,1,1,-86.5,-13.5,3.9,13188,14,0,,-56.0,
48,2020-12-14 23:10:04.551,368,1,1,-91.7,-18.7,-7.1,13192,10,0,,-56.0,
50,2020-12-14 23:10:04.600,339,1,1,-85.3,-12.2,21.7,287554,10,0,,-54.3,
52,2020-12-14 23:10:04.662,246,1,1,-105.2,-28.6,-16.6,252207,1,0,,-56.6,
52,2020-12-14 23:10:04.662,247,1,1,-86.1,-9.5,16.1,252207,6,0,,-56.6,
54,2020-12-14 23:10:04.718,242,1,1,-101.3,-32.3,-20.8,287552,0,0,,-50.3,
54,2020-12-14 23:10:04.718,339,1,1,-81.4,-12.4,20.4,287549,10,0,,-50.3,
56,2020-12-14 23:10:04.878,24,1,1,-113.3,-19.2,-9.5,13250,10,0,,-74.1,
56,2020-12-14 23:10:04.878,363,1,1,-114.3,-20.2,-10.6,13223,6,0,,-74.1,
56,2020-12-14 23:10:04.878,194,1,1,-112.4,-18.3,-5.6,13196,33,0,,-74.1,
56,2020-12-14 23:10:04.878,394,1,1,-115.2,-21.4,-9.0,227924,21,0,,-73.8,
56,2020-12-14 23:10:04.878,26,1,1,-112.9,-18.8,-6.3,13250,7,0,,-74.1,
56,2020-12-14 23:10:04.878,355,1,1,-114.8,-20.7,-8.5,13212,27,0,,-74.1,
56,2020-12-14 23:10:04.878,285,1,1,-114.8,-20.7,-11.1,13251,6,0,,-74.1,
56,2020-12-14 23:10:04.878,393,1,1,-105.8,-12.0,0.3,227926,24,0,,-73.8,
58,2020-12-14 23:10:05.018,393,1,1,-108.4,-13.4,-2.2,227924,20,0,,-76.3,
58,2020-12-14 23:10:05.018,194,1,1,-118.7,-23.5,-11.1,13189,31,0,,-76.4,
58,2020-12-14 23:10:05.018,394,1,1,-113.9,-18.8,-6.2,227925,22,0,,-76.3,
58,2020-12-14 23:10:05.018,369,1,1,-118.4,-23.1,-13.7,13276,8,0,,-76.5,
58,2020-12-14 23:10:05.018,24,1,1,-110.1,-14.9,-4.3,13251,9,0,,-76.5,
58,2020-12-14 23:10:05.018,26,1,1,-116.3,-21.1,-8.7,13250,7,0,,-76.5,
58,2020-12-14 23:10:05.018,285,1,1,-117.9,-22.7,-13.2,13251,1,0,,-76.5,
58,2020-12-14 23:10:05.018,355,1,1,-116.0,-20.8,-8.3,13233,28,0,,-76.5,
60,2020-12-14 23:10:05.075,366,1,1,-74.0,-13.4,6.2,13194,10,0,,-40.6,
60,2020-12-14 23:10:05.075,368,1,1,-82.9,-22.3,-11.6,13191,10,0,,-40.6,
import unittest
import pandas as pd
from context import measprocess as mpc
import os
import pickle
def extract_scan(path):
    """Load the pickled OFDM scan stored under *path* (file ``ofdmscan.p0``).

    Returns a 3-tuple: the first record, the band entry and the channel
    entry of the unpickled structure.
    NOTE(review): the channel lookup omits the ``[1]`` level used by the
    first two lookups - confirm this asymmetry is intended.
    """
    scan_path = os.path.join(path, 'ofdmscan.p0')
    with open(scan_path, "rb") as scan_file:
        payload = pickle.load(scan_file)
    entry = payload[0][7]
    return entry[1]['rec'][0], entry[1]['ot']['Band'], entry['ot']['Ch']
def extract_gps(path):
    """Load the pickled GPS trace stored under *path* (file ``gps.p0``).

    Returns the ``'ot'`` entry of the first unpickled element.
    """
    gps_path = os.path.join(path, 'gps.p0')
    with open(gps_path, "rb") as gps_file:
        payload = pickle.load(gps_file)
    return payload[0]['ot']
class TestSum(unittest.TestCase):
    """Legacy end-to-end check of synchronize_df on pickled scanner data."""

    def test_synchronize(self):
        path = [
            "tests/example_files/",
            'scanner'
        ]
        meas, band, channel = extract_scan(
            os.path.join(*path)
        )
        meas['band'] = band
        meas['channel'] = channel
        gps = extract_gps(
            os.path.join(*path)
        )
        merged = mpc.preprocess.synchronize_df(meas, gps, 'Datetime', 'fk_gps', ['Lon.', 'Lat.', 'Quality'])
        print(merged)


class TestLinkFrames(unittest.TestCase):
    """Tests for mpc.preprocess.link_dataframes on dummy and real data."""

    def setUp(self):
        # Init the dummy example
        A = pd.DataFrame({'ref': [10, 20, 30, 31, 5], 'info': [5, 4, 1, 2, 1]})
        A.index = range(2, 7)
        B = pd.DataFrame({'ref': [15, 17, 27, 4], 'info': [1, 2, 3, 4], 'yo': [4, 6, 1, 8]})
        B.index = range(9, 13)
        self._A, self._B = A, B
        # Real scanner/GPS fixtures (CSV exports)
        self._meas = pd.read_csv("tests/example_files/scanner_1/measurement_example.csv", index_col=0)
        self._gps = pd.read_csv("tests/example_files/scanner_1/gps_example.csv", index_col=0)

    def test_multiindex_dummy(self):
        combined = mpc.preprocess.link_dataframes(self._A, self._B, ref_col="ref")
        self.assertTrue(
            all(self._A == combined['A'])
        )

    def test_real_numeric_type(self):
        # 'Datetime' is still a string column here, so subtraction in the
        # metric must fail with ValueError
        with self.assertRaises(ValueError):
            mpc.preprocess.link_dataframes(
                self._meas,
                self._gps,
                "Datetime",
                unique_col="PCI"
            )

    def test_multiindex_real(self):
        meas, gps = self._meas, self._gps
        meas['Datetime'] = pd.to_datetime(meas['Datetime'])
        gps['Datetime'] = pd.to_datetime(gps['Datetime'])
        combined = mpc.preprocess.link_dataframes(
            meas,
            gps,
            "Datetime",
            unique_col="PCI"
        )
        self.assertTrue(
            all(meas == combined['A'])
        )

    def test_duplicates_dummy(self):
        # 'info' contains duplicates and no unique_col is given -> must raise
        with self.assertRaises(ValueError):
            mpc.preprocess.link_dataframes(self._A, self._B, ref_col="info")


if __name__ == '__main__':
    unittest.main()
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment