Source code for date_extractor_mds.date_extractor_mds

import re
from datetime import datetime, time
from typing import Union

import pandas as pd

def validate_datetime(input_value: Union[str, pd.Series]) -> None:
    """
    Validate that the input follows the ISO 8601 ``YYYY-MM-DDThh:mm:ss`` layout.

    Parameters
    ----------
    input_value : str or pandas.Series
        The input to validate. Can be either a single string or a Pandas
        Series containing strings.

    Returns
    -------
    None
        This function does not return a value.

    Raises
    ------
    TypeError
        If the input is not a string or a Pandas Series.
    ValueError
        If the input string or any Series element doesn't match the layout.
    ValueError
        If the Series contains non-string elements.

    Notes
    -----
    The only accepted layout is ``YYYY-MM-DDThh:mm:ss``. The entire string
    must match: extra leading or trailing characters (including a trailing
    newline) and non-ASCII digits are rejected. Only the layout is checked
    here; the individual fields are not range-checked (for example, a month
    of ``13`` passes this check).
    """
    # re.ASCII restricts \d to 0-9; without it, digits from other scripts
    # would be accepted as a valid timestamp.
    iso8601_pattern = re.compile(
        r"\d{4}-\d{2}-\d{2}T\d{2}:\d{2}:\d{2}", re.ASCII
    )

    def is_iso8601_compliant(date_str: str) -> bool:
        """Return True if *date_str* is exactly ``YYYY-MM-DDThh:mm:ss``."""
        # fullmatch (rather than match with ^...$) is required because "$"
        # also matches just before a trailing newline.
        return iso8601_pattern.fullmatch(date_str) is not None

    if isinstance(input_value, str):
        # A single string is validated directly.
        if not is_iso8601_compliant(input_value):
            raise ValueError(
                f"The input string '{input_value}' is not in valid ISO 8601 format."
            )
    elif isinstance(input_value, pd.Series):
        # Type-check every element first so that a non-string element is
        # always reported as such, regardless of where it sits in the Series.
        if not all(isinstance(item, str) for item in input_value):
            raise ValueError("All elements of the Pandas Series must be strings.")
        if not all(is_iso8601_compliant(item) for item in input_value):
            raise ValueError(
                "One or more elements in the Pandas Series are not in valid ISO 8601 format."
            )
    else:
        # Anything that is neither a string nor a Series is unsupported.
        raise TypeError("Input must be either a string or a Pandas Series of strings.")
def extract_year(iso_date: Union[str, pd.Series]) -> Union[int, pd.Series]:
    """
    Extract the year from an ISO 8601 date string.

    This function accepts either an individual string, or a Pandas Series.

    Parameters
    ----------
    iso_date : str or pandas.Series
        A date string, or Pandas Series containing strings, in ISO 8601
        format (YYYY-MM-DDThh:mm:ss).

    Returns
    -------
    int (if input was string)
        The year as an integer parsed from the four-digit YYYY field.
    pandas.Series (if input was pandas.Series)
        A pandas.Series containing the years as integers.

    Raises
    ------
    TypeError
        Propagated from ``validate_datetime`` when the input is neither a
        string nor a Pandas Series.
    ValueError
        Propagated from ``validate_datetime`` when the input is not in the
        expected format.

    Examples
    --------
    Extract the year from a single date string:

    >>> extract_year("2023-07-16T12:34:56")
    2023

    Apply the function to a Pandas Series:

    >>> import pandas as pd
    >>> data = {'dates': ["2023-07-16T12:34:56", "2024-03-25T08:15:30"]}
    >>> df = pd.DataFrame(data)
    >>> year = extract_year(df['dates'])
    >>> print(year)
    0    2023
    1    2024
    Name: dates, dtype: int64
    """
    # Reject anything that is not YYYY-MM-DDThh:mm:ss before slicing it.
    validate_datetime(iso_date)

    def extract_year_from_string(date_str: str) -> int:
        """Return the YYYY field of one validated date string as an int."""
        # Everything before the first "-" is the year field.
        return int(date_str.split("-")[0])

    if isinstance(iso_date, str):
        return extract_year_from_string(iso_date)
    # validate_datetime only lets str or Series through, so this is a Series.
    return iso_date.apply(extract_year_from_string)
def extract_month(input_data: Union[str, pd.Series]) -> Union[int, pd.Series]:
    """
    Extract the month from an ISO 8601 date string.

    This function accepts either an individual string, or a Pandas Series.

    Parameters
    ----------
    input_data : str or pandas.Series
        A single ISO 8601 date string (YYYY-MM-DDThh:mm:ss) or a Pandas
        Series containing such date strings.

    Returns
    -------
    int (if input was string)
        The month as an integer (1-12).
    pandas.Series (if input was pandas.Series)
        A pandas.Series containing the extracted months as integers.

    Raises
    ------
    TypeError
        Propagated from ``validate_datetime`` when the input is neither a
        string nor a Pandas Series.
    ValueError
        Propagated from ``validate_datetime`` when the input is not in the
        expected format, or raised by ``datetime.strptime`` when the date
        portion cannot be parsed as a real calendar date.

    Examples
    --------
    Extract the month from a single ISO 8601 string:

    >>> extract_month("2023-07-16T12:34:56")
    7

    Process a Pandas Series column containing ISO 8601 strings:

    >>> import pandas as pd
    >>> data = {'dates': ["2023-07-16T12:34:56", "2024-03-25T12:34:56"]}
    >>> df = pd.DataFrame(data)
    >>> months = extract_month(df["dates"])
    >>> print(months)
    0    7
    1    3
    Name: dates, dtype: int64
    """
    # Reject anything that is not YYYY-MM-DDThh:mm:ss before parsing it.
    validate_datetime(input_data)

    def extract_single_month(datetime_str: str) -> int:
        """Return the month (1-12) of one validated datetime string."""
        # Only the date portion (before the "T") is parsed; strptime also
        # rejects out-of-range month/day values.
        date_part = datetime_str.split("T")[0]
        return datetime.strptime(date_part, "%Y-%m-%d").month

    if isinstance(input_data, str):
        return extract_single_month(input_data)
    # validate_datetime only lets str or Series through, so this is a Series.
    return input_data.apply(extract_single_month)
def extract_day(datetime_input: Union[str, pd.Series]) -> Union[int, pd.Series]:
    """
    Extract the day from an ISO 8601 date string.

    This function can handle both individual strings and Pandas Series.

    Parameters
    ----------
    datetime_input : str or pandas.Series
        A date string, or Pandas Series containing strings, in ISO 8601
        format (YYYY-MM-DDThh:mm:ss).

    Returns
    -------
    int (if input was string)
        The day as an integer parsed from the two-digit DD field.
    pandas.Series (if input was pandas.Series)
        A pandas.Series containing the days as integers.

    Raises
    ------
    TypeError
        Propagated from ``validate_datetime`` when the input is neither a
        string nor a Pandas Series.
    ValueError
        Propagated from ``validate_datetime`` when the input is not in the
        expected format.

    Examples
    --------
    >>> extract_day("2023-07-16T12:34:56")
    16

    Apply the function to a Pandas Series:

    >>> import pandas as pd
    >>> data = {'dates': ["2023-07-16T12:34:56", "2024-03-25T08:15:30"]}
    >>> df = pd.DataFrame(data)
    >>> day = extract_day(df['dates'])
    >>> print(day)
    0    16
    1    25
    Name: dates, dtype: int64
    """
    # A single call validates a string or every element of a Series, so no
    # per-element re-validation is needed afterwards.
    validate_datetime(datetime_input)

    def extract_single_day(datetime_str: str) -> int:
        """Return the DD field of one validated datetime string as an int."""
        # In YYYY-MM-DDThh:mm:ss the day occupies characters 8 and 9.
        return int(datetime_str[8:10])

    if isinstance(datetime_input, str):
        return extract_single_day(datetime_input)
    # validate_datetime only lets str or Series through, so this is a Series.
    return datetime_input.apply(extract_single_day)
def extract_time(datetime_input: Union[str, pd.Series]) -> Union[time, pd.Series]:
    """
    Extract the time of day from an ISO 8601 datetime string.

    This function accepts either an individual string, or a Pandas Series.

    Parameters
    ----------
    datetime_input : str or pandas.Series
        A datetime string, or a Pandas Series containing datetime strings,
        in ISO 8601 format (YYYY-MM-DDThh:mm:ss).

    Returns
    -------
    datetime.time (if input was string)
        The time as a datetime.time object.
    pandas.Series (if input was pandas.Series)
        A pandas.Series containing rows of datetime.time objects.

    Raises
    ------
    TypeError
        Propagated from ``validate_datetime`` when the input is neither a
        string nor a Pandas Series.
    ValueError
        Propagated from ``validate_datetime`` when the input is not in the
        expected format, or raised by ``datetime.strptime`` when the time
        portion cannot be parsed as a real time of day.

    Examples
    --------
    Extract the time from a single date string:

    >>> extract_time("2023-07-16T12:34:56")
    datetime.time(12, 34, 56)

    Apply the function to a Pandas DataFrame column:

    >>> import pandas as pd
    >>> data = {'dates': ["2023-07-16T12:34:56", "2024-03-25T08:15:30"]}
    >>> df = pd.DataFrame(data)
    >>> times = extract_time(df['dates'])
    >>> print(times)
    0    12:34:56
    1    08:15:30
    Name: dates, dtype: object
    """
    # Reject anything that is not YYYY-MM-DDThh:mm:ss before parsing it.
    validate_datetime(datetime_input)

    def extract_single_time(datetime_str: str) -> time:
        """Return the hh:mm:ss part of one validated string as a time."""
        # Only the portion after the "T" is parsed; strptime also rejects
        # out-of-range hour/minute/second values.
        time_part = datetime_str.split("T")[1]
        return datetime.strptime(time_part, "%H:%M:%S").time()

    if isinstance(datetime_input, str):
        return extract_single_time(datetime_input)
    # validate_datetime only lets str or Series through, so this is a Series.
    return datetime_input.apply(extract_single_time)