Skip to content

functions

check_or_create_dir(directory)

Check if a directory exists and create it if it doesn't

Parameters:

Name Type Description Default
directory str

The directory path

required
Source code in gps_synth/common/functions.py
27
28
29
30
31
32
33
34
35
36
37
38
39
40
def check_or_create_dir(directory: str) -> None:
    """
    Check if a directory exists and create it if it doesn't

    Missing intermediate directories are created as well. If the path already
    exists (as a directory or anything else) nothing is done.

    Args:
        directory (str): The directory path

    Raises:
        OSError: If the directory cannot be created. The error is printed
            and then re-raised unchanged.
    """

    try:
        if not os.path.exists(directory):
            # exist_ok=True closes the gap between the check above and the
            # creation: if something else creates the directory in the
            # meantime, makedirs no longer fails with FileExistsError.
            os.makedirs(directory, exist_ok=True)
    except OSError as e:
        print(f"Unable to handle input directory path '{directory}': {e}")
        raise

class_getter(module_path, class_name)

Return the specified class from a module

Parameters:

Name Type Description Default
module_path str

Dotted import path of the module that defines the class

required
class_name str

Name of the class to look up in that module

required

Returns:

Name Type Description
Type Type

A class (https://stackoverflow.com/a/23198094)

Source code in gps_synth/common/functions.py
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
def class_getter(module_path: str, class_name: str) -> Type:
    """
    Look up a class by name inside a module given by its import path

    The module is imported with ``importlib.import_module`` and the attribute
    called ``class_name`` is read from it; the class itself is returned, it is
    not instantiated (approach from https://stackoverflow.com/a/23198094).

    Args:
        module_path (str): Dotted import path of the module holding the class
        class_name (str): Name of the attribute (class) to fetch from the module

    Returns:
        Type: The attribute found under ``class_name`` in the imported module
    """
    return getattr(importlib.import_module(module_path), class_name)

delete_directory(directory)

Delete a directory, whether it is empty or non-empty

Parameters:

Name Type Description Default
directory str

The directory path

required
Source code in gps_synth/common/functions.py
12
13
14
15
16
17
18
19
20
21
22
23
24
def delete_directory(directory: str) -> None:
    """
    Remove a directory tree, whether it is empty or still has contents

    Args:
        directory (str): The directory path

    Raises:
        OSError: If the removal fails. The error is printed and then
            re-raised unchanged.
    """

    try:
        shutil.rmtree(directory)
    except OSError as e:
        failure_message = f"Unable to handle input directory path '{directory}': {e}"
        print(failure_message)
        raise

    return None

write_df_to_parquet(df, base_dir, partition_cols=None, existing_data_behavior=None)

Writes a dataframe to Parquet. If df is empty, writes a file. When using the same path, but with data, the file gets overwritten.

Parameters:

Name Type Description Default
df DataFrame

Dataframe to write in parquet

required
base_dir str

Base directory where to write data

required
partition_cols Optional[List[str]] = None

A list of columns to use for partitioning, if None use [profile_name]

None
existing_data_behavior Optional[str]

Controls how the dataset will handle data that already exists in the destination

None
Source code in gps_synth/common/functions.py
 60
 61
 62
 63
 64
 65
 66
 67
 68
 69
 70
 71
 72
 73
 74
 75
 76
 77
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
def write_df_to_parquet(
    df: DataFrame,
    base_dir: str,
    partition_cols: Optional[List[str]] = None,
    existing_data_behavior: Optional[str] = None,
) -> None:
    """
    Write a dataframe as a hive-partitioned Parquet dataset under ``base_dir``

    The frame is converted to a pyarrow Table and handed to
    ``ds.write_dataset``. If ``base_dir`` currently exists as a regular file,
    that file is removed first so the partitioned dataset can be written there.
    Output file names start with ``part-{i}`` followed by a fresh uuid4 hex
    string and the ``.parquet`` extension.

    Args:
        df (DataFrame): Dataframe to write in parquet
        base_dir (str): Base directory where to write data
        partition_cols (Optional[List[str]]): Columns to partition by;
            ``["profile_name"]`` is used when None
        existing_data_behavior (Optional[str]): Passed to ``ds.write_dataset``
            to control how data already present at the destination is handled;
            ``"overwrite_or_ignore"`` is used when None
    """

    # Fill in the defaults for arguments the caller left as None
    if partition_cols is None:
        partition_cols = ["profile_name"]
    if existing_data_behavior is None:
        existing_data_behavior = "overwrite_or_ignore"

    # A plain file sitting at base_dir (per the original note, possibly left
    # by an earlier save of an empty frame) has to go before the partitioned
    # dataset can be written there
    if os.path.isfile(base_dir):
        os.remove(base_dir)

    arrow_table = pa.Table.from_pandas(df)

    # Drop this function's reference to the pandas frame once it is converted
    del df

    # The uuid suffix gives every call its own file names
    file_name_template = "part-{i}" + f"{uuid.uuid4().hex}.parquet"

    ds.write_dataset(
        arrow_table,
        base_dir=base_dir,
        format="parquet",
        partitioning=partition_cols,
        existing_data_behavior=existing_data_behavior,
        partitioning_flavor="hive",
        basename_template=file_name_template,
    )