From fe57a1f6e6f6b165fc890c7a6ac8f71144016e64 Mon Sep 17 00:00:00 2001 From: Emmanuel MALHERBE Date: Tue, 13 Jan 2026 18:34:25 +0100 Subject: [PATCH 01/10] ADD: removed prefix and suffix logic. Simplifies doc and code --- choice_learn/data/choice_dataset.py | 95 ++++++----------------------- 1 file changed, 19 insertions(+), 76 deletions(-) diff --git a/choice_learn/data/choice_dataset.py b/choice_learn/data/choice_dataset.py index edc4902f..1300254c 100644 --- a/choice_learn/data/choice_dataset.py +++ b/choice_learn/data/choice_dataset.py @@ -884,11 +884,8 @@ def from_single_wide_df( df, items_id, shared_features_columns=None, - items_features_suffixes=None, - items_features_prefixes=None, - available_items_suffix=None, - available_items_prefix=None, - delimiter="_", + items_features_patterns=None, + available_items_pattern=None, choices_column="choice", choice_format="items_id", ): @@ -902,21 +899,14 @@ def from_single_wide_df( List of items ids shared_features_columns : list, optional List of columns of the dataframe that are shared_features_by_choice, default is None - items_features_prefixes : list, optional - Prefixes of the columns of the dataframe that are items_features_by_choice, + items_features_patterns : list of str, optional + Patterns of the columns of the dataframe that are items_features_by_choice, + given as "*suffix" or "prefix*" where "*" is replaced by items_id in df columns. default is None - items_features_suffixes : list, optional - Suffixes of the columns of the dataframe that are items_features_by_choice, + available_items_pattern: str, optional + Pattern of the columns of the dataframe that are available_items_by_choice, + given as "*suffix" or "prefix*" where "*" is replaced by items_id in df columns. 
default is None - available_items_prefix: str, optional - Prefix of the columns of the dataframe that precise available_items_by_choice, - default is None - available_items_suffix: str, optional - Suffix of the columns of the dataframe that precise available_items_by_choice, - default is None - delimiter: str, optional - Delimiter used to separate the given prefix or suffixes and the features names, - default is "_" choice_column: str, optional Name of the column containing the choices, default is "choice" choice_format: str, optional @@ -928,11 +918,6 @@ def from_single_wide_df( ChoiceDataset corresponding ChoiceDataset """ - if available_items_prefix is not None and available_items_suffix is not None: - raise ValueError( - "You cannot give both available_items_prefix and\ - available_items_suffix." - ) if choice_format not in ["items_index", "items_id"]: logging.warning("choice_format not understood, defaulting to 'items_index'") @@ -943,43 +928,12 @@ def from_single_wide_df( shared_features_by_choice = None shared_features_by_choice_names = None - if items_features_suffixes is not None and items_features_prefixes is not None: - # The list of features names is the concatenation of the two lists of - # prefixes and suffixes - items_features_names = items_features_prefixes + items_features_suffixes - items_features_by_choice = [] - for item in items_id: - columns = [f"{feature}{delimiter}{item}" for feature in items_features_prefixes] + [ - f"{item}{delimiter}{feature}" for feature in items_features_suffixes - ] - for col in columns: - if col not in df.columns: - logging.warning( - f"Column {col} was not in DataFrame,\ - dummy creation of the feature with zeros." 
- ) - df[col] = 0 - items_features_by_choice.append(df[columns].to_numpy()) - items_features_by_choice = np.stack(items_features_by_choice, axis=1) - elif items_features_suffixes is not None: - items_features_names = items_features_suffixes + if items_features_patterns is not None: + assert all(["*" in pattern for pattern in items_features_patterns]), \ + "items_features_patterns should all contain '*' character." items_features_by_choice = [] for item in items_id: - columns = [f"{item}{delimiter}{feature}" for feature in items_features_suffixes] - for col in columns: - if col not in df.columns: - logging.warning( - f"Column {col} was not in DataFrame,\ - dummy creation of the feature with zeros." - ) - df[col] = 0 - items_features_by_choice.append(df[columns].to_numpy()) - items_features_by_choice = np.stack(items_features_by_choice, axis=1) - elif items_features_prefixes is not None: - items_features_names = items_features_prefixes - items_features_by_choice = [] - for item in items_id: - columns = [f"{feature}{delimiter}{item}" for feature in items_features_prefixes] + columns = [feature.replace("*", item) for feature in items_features_patterns] for col in columns: if col not in df.columns: logging.warning( @@ -993,31 +947,20 @@ def from_single_wide_df( items_features_by_choice = None items_features_names = None - if available_items_suffix is not None: - if isinstance(available_items_suffix, list): - if not len(available_items_suffix) == len(items_id): - raise ValueError( - "You have given a list of columns for availabilities." 
- "We consider that it is one for each item however lenghts do not match" - ) - logging.info("You have given a list of columns for availabilities.") - logging.info("Each column will be matched to an item, given their order") - available_items_by_choice = df[available_items_suffix].to_numpy() - else: - columns = [f"{item}{delimiter}{available_items_suffix}" for item in items_id] - available_items_by_choice = df[columns].to_numpy() - elif available_items_prefix is not None: - if isinstance(available_items_prefix, list): - if not len(available_items_prefix) == len(items_id): + if available_items_pattern is not None: + if isinstance(available_items_pattern, list): + if not len(available_items_pattern) == len(items_id): raise ValueError( "You have given a list of columns for availabilities." "We consider that it is one for each item however lenghts do not match" ) logging.info("You have given a list of columns for availabilities.") logging.info("Each column will be matched to an item, given their order") - available_items_by_choice = df[available_items_prefix].to_numpy() + available_items_by_choice = df[available_items_pattern].to_numpy() else: - columns = [f"{available_items_prefix}{delimiter}{item}" for item in items_id] + assert "*" in available_items_pattern, \ + "available_items_pattern should contain '*' character." 
+ columns = [available_items_pattern.replace("*", item) for item in items_id] available_items_by_choice = df[columns].to_numpy() else: available_items_by_choice = None From 75888343608475d21bd1fc18f82d0007fa96355e Mon Sep 17 00:00:00 2001 From: Emmanuel MALHERBE Date: Tue, 13 Jan 2026 18:59:41 +0100 Subject: [PATCH 02/10] REFAC: some change in doc --- notebooks/introduction/2_data_handling.ipynb | 403 +++++++++---------- 1 file changed, 194 insertions(+), 209 deletions(-) diff --git a/notebooks/introduction/2_data_handling.ipynb b/notebooks/introduction/2_data_handling.ipynb index 7fb91043..7da51581 100644 --- a/notebooks/introduction/2_data_handling.ipynb +++ b/notebooks/introduction/2_data_handling.ipynb @@ -52,7 +52,7 @@ "\n", "- [**Introduction**](#an-introduction-to-choicedataset)\n", " - [Example dataset: SwissMetro](#our-example-dataset-swissmetro)\n", - " - [The different types of data](#the-different-type-of-data)\n", + " - [The different components of data](#the-different-type-of-data)\n", "- [**ChoiceDataset's Instantiation from a single DataFrame**](#hands-on:-example-from-a-panda's-dataframe)\n", " - [Wide format](#creating-a-choicedataset-from-a-wide-dataframe)\n", " - [Long format](#creating-a-choicedataset-from-a-long-dataframe)\n", @@ -86,7 +86,50 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "### Our example dataset: SwissMetro\n", + "## The different components of data\n", + "\n", + "We can split the columns into three distincts categories that are common to most choice modelling use-cases:\n", + "\n", + "- Choices - or outputs of our model: it's what we want to predict\n", + "- Features - or inputs of our model\n", + "- Availabilities - or the description of the set among which the customer chooses\n", + "\n", + "Going further, we have two types of features: the features describing the customer and the features describing the mean of transportation. 
Those are the four types of data that can be specified in a ChoiceDataset.\n", + "\n", + "\n", + "**Vocabulary:**\n", + "\n", + "*Items* represent a product, an alternative that can be chosen by the customer at some point.\n", + "\n", + "\n", + "Throughout Choice-Learn examples and code here is the naming of our four types of data:\n", + "\n", + "- **choices:** which item has been chosen among all available items\n", + "\n", + "- **shared_features_by_choice:** It represents all the features that might change from one choice to another and that are **common** to all items (e.g. day of week, customer features, etc...).\n", + " \n", + "- **items_features_by_choice:** The features of each available item for a choice (e.g. prices might change from one choice to another and are specific to each sold item).\n", + " \n", + "- **available_items_by_choice:** For each choice it represents whether each item is proposed to the customer (1.) or not (0.).\n", + "\n", + "**Summary:**\n", + "\n", + "| index | feature | typical shape | Example | Taken Values |\n", + "|---|---|---|---|---|\n", + "| 1 | shared_features_by_choice | (n_choices, n_features) | customer age, day of week | float, int |\n", + "| 2 | items_features_by_choice | (n_choices, n_items, n_items_features) | price | float, int |\n", + "| 3 | available_items_by_choice | (n_choices, n_items) | | 1.(av) or 0. (not av.) |\n", + "| 4 | choices | (n_choices,) | | int: index of chosen item |\n", + "\n", + "\n", + "![DatasetDiagram](../../docs/illustrations/choice_learn_dataset.png)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Hands-on: Example of SwissMetro, a DataFrame in wide format\n", "\n", "The SwissMetro[2] is a well-known dataset used to illustrate choice modelling.
The dataset is provided with the Choice-Learn package and can be downloaded as follows:" ] @@ -139,91 +182,19 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "### The different type of data\n", - "\n", - "We can split the columns into three distincts categories that are common to most choice modelling use-cases:\n", - "\n", - "- Choices - or outputs of our model: it's what we want to predict\n", - "- Features - or inputs of our model\n", - "- Availabilities - or the description of the set among which the customer chooses\n", - "\n", - "Going further, we have two types of features: the features describing the customer and the features describing the mean of transportation. Those are the four types of data that can be specified in a ChoiceDataset.\n", - "\n", - "\n", - "**Vocabulary:**\n", - "\n", - "*Items* represent a product, an alternative that can be chosen by the customer at some point.\n", - "\n", - "\n", - "Throughout Choice-Learn examples and code here is the naming of our four types of data:\n", - "\n", - "- **choices:** which item has been chosen among all availables\n", - "\n", - "- **shared_features_by_choice:** It represents all the features that might change from one choice to another and that are **common** to all items (e.g. day of week, customer features, etc...).\n", - " \n", - "- **items_features_by_choice:** The features each of the available item for a choice (e.g. prices might change from one choice to another and are specific to each sold item).\n", - " \n", - "- **available_items_by_choice:** For each choice it represents whether each item is proposed to the customer (1.) 
or not (0.).\n", - "\n", - "**Summary:**\n", - "\n", - "| index | feature | typical shape | Example | Taken Values |\n", - "|---|---|---|---|---|\n", - "| 1 | shared_features_by_choice | (n_choices, n_features) | customer age, day of week | float, int |\n", - "| 2 | items_features_by_choice | (n_choices, n_items, n_items_features) | price | float, int |\n", - "| 3 | available_items_by_choice | (n_choices, n_items) | | 1.(av) or 0. (not av.) |\n", - "| 4 | choices | (n_choices,) | | int: index of chosen item |\n", - "\n", - "\n", - "![DatasetDiagram](../../docs/illustrations/choice_learn_dataset.png)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Hands-on: example from a pandas' DataFrame\n", - "\n", - "The easiest way create a ChoiceDataset is to use a pandas DataFrame.\n", - "\n", - "First, here is a small explanation about wide vs long format, in case you have never heard about it, from [Wikipedia](https://en.wikipedia.org/wiki/Wide_and_narrow_data).\n", - "\n", - "*Long (or narrow) Format:* One column containing all the values and another column listing the context of the value\\\n", - "*Wide Format:* Each different data variable in a separate column.\n", + "Our example dataframe on SwissMetro is on the wide format. Each row indicates a choice and each item has its specific features columns. \n", "\n", "\n", - "\n", + "\n", "\n", "
Example Long Format: Example Wide Format:
Example Wide Format:
\n", "\n", - "| choice id | item | price | availability | choice |\n", - "|---|---|---|---|---|\n", - "| 1 | A | 2.0 | 1 | 1 |\n", - "| 1 | B | 6.0 | 1 | 0 |\n", - "| 2 | A | 1.5 | 1 | 0 |\n", - "| 2 | B | 5.5 | 1 | 1 |\n", - "\n", - "\n", - "\n", "| choice id | price_A | price_B | availability_A | availability_B | choice |\n", "|---|---|---|---|---|---|\n", "| 1 | 2.0 | 6.0 | 1 | 1 | A |\n", "| 2 | 1.5 | 5.5 | 1 | 1 | B |\n", "\n", - "
\n", - "\n", - "Choice-Learn handles both formats, but slightly differently:\n", - "- example for [wide](#creating-a-choicedataset-from-a-wide-dataframe) format\n", - "- example for [long](#creating-a-choicedataset-from-a-long-dataframe) format" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### Creating a ChoiceDataset from a wide DataFrame\n", - "\n", - "Our example dataframe on SwissMetro is on the wide format. Each row indicates a choice and each item has its specific features columns." + " " ] }, { @@ -290,8 +261,7 @@ "items_features_by_choice and available_items_by_choice:\n", "\n", "It is possible to precise:\n", - "- Suffixes: in this case the column used will be \"item_id\" + \"delimiter\" + \"suffix\"\n", - "- Prefixes: in this case the column used will be \"prefix\" + \"delimiter\" + \"item_id\"\n", + "- Patterns: in this case the column used is obtained by replacing \"*\" in the pattern string with the \"item_id\"\n", "- Columns: each item's features in list. In this case it is you duty to ensure coherence in terms of items and features orders. For our example it would be:\n", "\n", " ```python\n", @@ -304,14 +274,29 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "### Creating a ChoiceDataset from a long DataFrame\n", + "## Hands-on: Creating a ChoiceDataset from a DataFrame in long format\n", + "Creating a ChoiceDataset from a long DataFrame\n", "The long format is also commonly used in which each row represents an alternative. One of its benefits is represent unavailability through missing rows - taking litteraly zero memory space. On the contrary the 'shared_features' such as customer features must be duplicated on each row.\\\n", - "The ChoiceDataset object can be instantiated from a long DF. It will infer the availabilities from existing/missing rows, if it is not specified.\\\n", + "The ChoiceDataset object can be created from a long DF.
It will infer the availabilities from existing/missing rows, if it is not specified.\\\n", "It is needed to precise:\n", "- columns representing the features ('shared_features_columns' and 'items_features_columns')\n", "- the column in which the choice is given and how it is formatted ('choices_columns' and 'choice_format')\n", "- which column can identify the items ('items_id_column')\n", - "- which column can identify all the rows corresponding to the same choice ('choices_id_column')" + "- which column can identify all the rows corresponding to the same choice ('choices_id_column')\n", + "\n", + "\n", + "\n", + "\n", + "
Example Long Format:
\n", + "\n", + "| choice id | item | price | availability | choice |\n", + "|---|---|---|---|---|\n", + "| 1 | A | 2.0 | 1 | 1 |\n", + "| 1 | B | 6.0 | 1 | 0 |\n", + "| 2 | A | 1.5 | 1 | 0 |\n", + "| 2 | B | 5.5 | 1 | 1 |\n", + "\n", + "
" ] }, { @@ -381,12 +366,12 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "## Instantiation from different objects\n", + "## Creating a Choice Dataset: General case\n", "\n", - "For RAM optimization purposes or just because of the format of the data source, it might happen that a dataset is split into separate files. You can instantiate a ChoiceDataset keeping this structure, saving time to concatenate everything.\\\n", + "For RAM optimization purposes or just because of the format of the data source, it might happen that a dataset is split into separate files. You can create a ChoiceDataset keeping this structure, saving time to concatenate everything.\\\n", "You can work either with pandas.DataFrames or numpy.ndarrays.\n", "\n", - "### Separating data types\n", + "### Separating data components: Swissmetro example\n", "The four distinct data types: choices, shared_features_by_choice, items_features_by_choice, available_items_by_choice can be manually given to the ChoiceDataset:" ] }, @@ -486,11 +471,11 @@ "\n", "### Estimating choice models\n", "\n", - "With your ChoiceDataset instantiated, it can be used as is to fit choice models. An illustration can be found in the conditional MNL introduction [notebook](./3_model_clogit.ipynb).\n", + "With your ChoiceDataset created, it can be used as is to fit choice models. An illustration can be found in the conditional MNL introduction [notebook](./3_model_clogit.ipynb).\n", "\n", "### Slicing and batching\n", "\n", - "ChoiceDatasets are indexed by choice, meaning that accessing the i-th index corresponds to the i-th choice. Differently said it is the i-th value of the object given as 'choices' in the ChoiceDataset instantiation.\n", + "ChoiceDatasets are indexed by choice, meaning that accessing the i-th index corresponds to the i-th choice. Differently said it is the i-th value of the object given as 'choices' in the ChoiceDataset creation.\n", "\n", "A ChoiceDataset can be sliced commonly using the [.] 
Python method:" ] @@ -540,7 +525,7 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "## More Advanced use: the FeatureStorage & RAM optimization\n", + "## Advanced use: the FeatureStorage & RAM optimization for big dataset\n", "\n", "In many use-cases we will see features or group of features values being repeated over the dataset. For example if one customer comes several times, its features will be repeated. With One-Hot representations, it can create memory-heavy repetitions.\\\n", "Choice-Learn introduces FeaturesStorage and FeaturesByIds in order to limit the memory usage before accessing a batch of data.\n", @@ -732,127 +717,6 @@ "Other examples of features_by_ids usage can be found [here](data/features_byID_examples.ipynb)." ] }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Additional Examples" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### The ModeCanada dataset\n", - "\n", - "We will use the ModeCanada [1] dataset for this example. The dataset is originally in the long format. It is provided with the choice-learn package and can loaded as follows:" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "from choice_learn.datasets import load_modecanada\n", - "\n", - "canada_transport_df = load_modecanada(as_frame=True)\n", - "canada_transport_df.head()" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "An extensive description of the dataset can be found [here](https://www.ssc.wisc.edu/~bhansen/econometrics/Koppelman_description.pdf). An extract indicates:\n", - "\n", - "\"The dataset was assembled in 1989 by VIA Rail (the Canadian national rail carrier) to estimate the demand for high-speed rail in the Toronto-Montreal corridor. The main information source was a Passenger Review administered to business travelers augmented by information about each trip. 
The observations consist of a choice between four modes of transportation (train, air, bus, car) with information about the travel mode and about the passenger. The posted dataset has been balanced to only include cases where all four travel modes are recorded. The file contains 11,116 observations on 2779 individuals. \"\n", - "\n", - "Alright ! If we go back to our dataframe, we can see the following columns:\n", - "\n", - "case: an ID of the traveler\n", - "alt: the alternative concerned by the row\n", - "choice: 1 if the alternative was chosen, 0 otherwise\n", - "dist: trip distance\n", - "cost: trip cost\n", - "ivt: travel time in-vehicule (minutes)\n", - "ovt: travel time out-vehicule (minutes)\n", - "income: housold income of traveler ($)\n", - "urban: 1 if origin or destination is a large city\n", - "noalt: the number of alternative among which the traveler had to chose\n", - "freq: the frequence of the alternative (0 for car) (e.g. how many train by hour)\n", - "Following our specification, we can see that one case corresponds to one customer thus one choice. In our choice-learn language it corresponds to \"one context\": a set of available alternatives and their features/specificites resulting in one choice. Let's regroup our features:\n", - "\n", - "**choices:** Easy ! It is the alternative whenever the value is one.\n", - "\n", - "**shared_features_by_choice:** The income, urban and distance (also noalt which is not really a feature) features are the same for all the alternatives within a single choice. They are all constant with respect to (case=traveler_ID).\n", - "\n", - "**items_features_by_choice:** Ivt, Ovt, cost and freq depends on and describe each of the alternative.\n", - "\n", - "**available_items_by_choice:** It in not directly indicated, however it can be easily deduced. Whenever an alternative is not available, it is not precised for its case. 
For example for the case=1, our first choice, only train and car are given as alternatives, meaning that air and bus could not be chosen/were not available." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "dataset = ChoiceDataset.from_single_long_df(\n", - " df=canada_transport_df,\n", - " choices_column=\"choice\",\n", - " items_id_column=\"alt\",\n", - " choices_id_column=\"case\",\n", - " shared_features_columns=[\"income\", \"urban\", \"dist\"],\n", - " items_features_columns=[\"cost\", \"freq\", \"ovt\", \"ivt\"],\n", - " choice_format=\"one_zero\")" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "In this example the 'choice_format' is \"one_zero\" while it was \"item_id\" in our previous SwissMetro example. As a short memento it specifies how the chosen alternative is precised: with ones (chosen) and zeros (not chosen) or directlu with the item_id of the chosen item.\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "
\"one_zero\" \"item_id\"
\n", - "\n", - "\n", - "| | case | alt | choice | dist | cost | ivt | ovt | freq | \tincome |\n", - "|---|---|---|---|---|---|---|---|---|---|\n", - "| 1 | 1 | train | 0 | 83 | 28.25 | 50 | 66 | 4 | 45 |\n", - "| 2 | 1 | car | 1 | 83 | 15.77 | 61 | 0 | 0 | 45 |\n", - "| 3 | 2 | train | 0 | 83 | 28.25 | 50 | 66 | 4 | 25 |\n", - "| 4 | 2 | car | 1 | 83 | 15.77 | 61 | 0 | 0 | 25 |\n", - "| 5 | 3 | train | 0 | 83 | 28.25 | 50 | 66 | 4 | 70 |\n", - "\n", - "\n", - "\n", - "\n", - "| | case | alt | choice | dist | cost | ivt | ovt | freq | \tincome |\n", - "|---|---|---|---|---|---|---|---|---|---|\n", - "| 1 | 1 | train | car | 83 | 28.25 | 50 | 66 | 4 | 45 |\n", - "| 2 | 1 | car | car | 83 | 15.77 | 61 | 0 | 0 | 45 |\n", - "| 3 | 2 | train | car | 83 | 28.25 | 50 | 66 | 4 | 25 |\n", - "| 4 | 2 | car | car | 83 | 15.77 | 61 | 0 | 0 | 25 |\n", - "| 5 | 3 | train | car | 83 | 28.25 | 50 | 66 | 4 | 70 |\n", - "\n", - "
\n", - "\n", - "In the first 5 examples, the chosen transportation is always the car." - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "That's it !" - ] - }, { "cell_type": "markdown", "metadata": {}, @@ -1189,6 +1053,127 @@ "- More in-depth examples and explanations can be found [here](./features_byID_example.ipynb)" ] }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Additional datasets" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### The ModeCanada dataset\n", + "\n", + "We will use the ModeCanada [1] dataset for this example. The dataset is originally in the long format. It is provided with the choice-learn package and can loaded as follows:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "from choice_learn.datasets import load_modecanada\n", + "\n", + "canada_transport_df = load_modecanada(as_frame=True)\n", + "canada_transport_df.head()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "An extensive description of the dataset can be found [here](https://www.ssc.wisc.edu/~bhansen/econometrics/Koppelman_description.pdf). An extract indicates:\n", + "\n", + "\"The dataset was assembled in 1989 by VIA Rail (the Canadian national rail carrier) to estimate the demand for high-speed rail in the Toronto-Montreal corridor. The main information source was a Passenger Review administered to business travelers augmented by information about each trip. The observations consist of a choice between four modes of transportation (train, air, bus, car) with information about the travel mode and about the passenger. The posted dataset has been balanced to only include cases where all four travel modes are recorded. The file contains 11,116 observations on 2779 individuals. \"\n", + "\n", + "Alright ! 
If we go back to our dataframe, we can see the following columns:\n", + "\n", + "case: an ID of the traveler\n", + "alt: the alternative concerned by the row\n", + "choice: 1 if the alternative was chosen, 0 otherwise\n", + "dist: trip distance\n", + "cost: trip cost\n", + "ivt: travel time in-vehicle (minutes)\n", + "ovt: travel time out-of-vehicle (minutes)\n", + "income: household income of traveler ($)\n", + "urban: 1 if origin or destination is a large city\n", + "noalt: the number of alternatives among which the traveler had to choose\n", + "freq: the frequency of the alternative (0 for car) (e.g. how many trains per hour)\n", + "Following our specification, we can see that one case corresponds to one customer thus one choice. In our choice-learn language it corresponds to \"one context\": a set of available alternatives and their features/specificities resulting in one choice. Let's regroup our features:\n", + "\n", + "**choices:** Easy ! It is the alternative whenever the value is one.\n", + "\n", + "**shared_features_by_choice:** The income, urban and distance (also noalt which is not really a feature) features are the same for all the alternatives within a single choice. They are all constant with respect to (case=traveler_ID).\n", + "\n", + "**items_features_by_choice:** Ivt, Ovt, cost and freq depend on and describe each of the alternatives.\n", + "\n", + "**available_items_by_choice:** It is not directly indicated, however it can be easily deduced. Whenever an alternative is not available, it is not listed for its case. For example for the case=1, our first choice, only train and car are given as alternatives, meaning that air and bus could not be chosen/were not available."
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "dataset = ChoiceDataset.from_single_long_df(\n", + " df=canada_transport_df,\n", + " choices_column=\"choice\",\n", + " items_id_column=\"alt\",\n", + " choices_id_column=\"case\",\n", + " shared_features_columns=[\"income\", \"urban\", \"dist\"],\n", + " items_features_columns=[\"cost\", \"freq\", \"ovt\", \"ivt\"],\n", + " choice_format=\"one_zero\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "In this example the 'choice_format' is \"one_zero\" while it was \"item_id\" in our previous SwissMetro example. As a short memento it specifies how the chosen alternative is specified: with ones (chosen) and zeros (not chosen) or directly with the item_id of the chosen item.\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "
\"one_zero\" \"item_id\"
\n", + "\n", + "\n", + "| | case | alt | choice | dist | cost | ivt | ovt | freq | \tincome |\n", + "|---|---|---|---|---|---|---|---|---|---|\n", + "| 1 | 1 | train | 0 | 83 | 28.25 | 50 | 66 | 4 | 45 |\n", + "| 2 | 1 | car | 1 | 83 | 15.77 | 61 | 0 | 0 | 45 |\n", + "| 3 | 2 | train | 0 | 83 | 28.25 | 50 | 66 | 4 | 25 |\n", + "| 4 | 2 | car | 1 | 83 | 15.77 | 61 | 0 | 0 | 25 |\n", + "| 5 | 3 | train | 0 | 83 | 28.25 | 50 | 66 | 4 | 70 |\n", + "\n", + "\n", + "\n", + "\n", + "| | case | alt | choice | dist | cost | ivt | ovt | freq | \tincome |\n", + "|---|---|---|---|---|---|---|---|---|---|\n", + "| 1 | 1 | train | car | 83 | 28.25 | 50 | 66 | 4 | 45 |\n", + "| 2 | 1 | car | car | 83 | 15.77 | 61 | 0 | 0 | 45 |\n", + "| 3 | 2 | train | car | 83 | 28.25 | 50 | 66 | 4 | 25 |\n", + "| 4 | 2 | car | car | 83 | 15.77 | 61 | 0 | 0 | 25 |\n", + "| 5 | 3 | train | car | 83 | 28.25 | 50 | 66 | 4 | 70 |\n", + "\n", + "
\n", + "\n", + "In the first 5 examples, the chosen transportation is always the car." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "That's it !" + ] + }, { "cell_type": "markdown", "metadata": {}, From 1253df803b10d6c84271ba298259ba8b2ec48a11 Mon Sep 17 00:00:00 2001 From: Emmanuel MALHERBE Date: Thu, 22 Jan 2026 18:17:34 +0100 Subject: [PATCH 03/10] FIX: new way to load datasets --- choice_learn/datasets/base.py | 44 ++++++++++++++++------------------- 1 file changed, 20 insertions(+), 24 deletions(-) diff --git a/choice_learn/datasets/base.py b/choice_learn/datasets/base.py index 22ce59de..dee10e25 100644 --- a/choice_learn/datasets/base.py +++ b/choice_learn/datasets/base.py @@ -338,9 +338,9 @@ def load_swissmetro(add_items_one_hot=False, as_frame=False, return_desc=False, df=swiss_df, items_id=items, shared_features_columns=shared_features_by_choice_names, - items_features_suffixes=items_features_by_choice_names - + ["ASC_TRAIN", "ASC_SM", "ASC_CAR"], - available_items_suffix=availabilities_column, + items_features_patterns=["*_%s" % column for column in ( + items_features_by_choice_names + ["ASC_TRAIN", "ASC_SM", "ASC_CAR"])], + available_items_pattern="*_%s" % availabilities_column, choices_column=choice_column, choice_format="items_index", ) @@ -532,8 +532,8 @@ def load_swissmetro(add_items_one_hot=False, as_frame=False, return_desc=False, df=swiss_df, items_id=items, shared_features_columns=shared_features_by_choice_names, - items_features_suffixes=items_features_by_choice_names, - available_items_suffix=availabilities_column, + items_features_patterns=["*_%s" % s for s in items_features_by_choice_names], + available_items_pattern="*_%s" % availabilities_column, choices_column=choice_column, choice_format="items_index", ) @@ -927,9 +927,7 @@ def load_train( df=train_df, items_id=["1", "2"], shared_features_columns=["id"], - items_features_prefixes=["price", "time", "change", "comfort"], - delimiter="", - 
available_items_suffix=None, + items_features_patterns=["price*", "time*", "change*", "comfort*"], choices_column="choice", choice_format="items_id", ) @@ -974,17 +972,17 @@ def load_car_preferences( cars_df["choice"] = cars_df.apply(lambda row: row.choice[-1], axis=1) shared_features = ["college", "hsg2", "coml5"] items_features = [ - "type", - "fuel", - "price", - "range", - "acc", - "speed", - "pollution", - "size", - "space", - "cost", - "station", + "type*", + "fuel*", + "price*", + "range*", + "acc*", + "speed*", + "pollution*", + "size*", + "space*", + "cost*", + "station*", ] items_id = [f"{i}" for i in range(1, 7)] @@ -992,8 +990,7 @@ def load_car_preferences( df=cars_df, items_id=items_id, shared_features_columns=shared_features, - items_features_prefixes=items_features, - delimiter="", + items_features_patterns=items_features, choices_column="choice", choice_format="items_id", ) @@ -1060,8 +1057,7 @@ def load_hc( return ChoiceDataset.from_single_wide_df( df=hc_df, shared_features_columns=["income"], - items_features_prefixes=["ich", "och", "occa", "icca"], - delimiter=".", + items_features_patterns=["ich.*", "och.*", "occa.*", "icca.*"], items_id=items_id, choices_column="depvar", choice_format="items_id", @@ -1206,7 +1202,7 @@ def load_londonpassenger( df=london_df, items_id=items, shared_features_columns=shared_features_by_choice_names, - items_features_suffixes=items_features_by_choice_names, + items_features_patterns=["*_%s" % s for s in items_features_by_choice_names], delimiter="_", choices_column=choice_column, choice_format="items_index", From 5e81a6c58546d40ce44db18e69f62fdb3b8a91d1 Mon Sep 17 00:00:00 2001 From: Emmanuel MALHERBE Date: Thu, 22 Jan 2026 18:32:51 +0100 Subject: [PATCH 04/10] FIX: updated notebook tutorial with pattern instead of suffix --- notebooks/data/dataset_creation.ipynb | 4 ++-- notebooks/introduction/2_data_handling.ipynb | 12 +++++------- 2 files changed, 7 insertions(+), 9 deletions(-) diff --git 
a/notebooks/data/dataset_creation.ipynb b/notebooks/data/dataset_creation.ipynb index 08dd7e3c..0c9f35b0 100644 --- a/notebooks/data/dataset_creation.ipynb +++ b/notebooks/data/dataset_creation.ipynb @@ -677,8 +677,8 @@ " items_id=[\"TRAIN\", \"SM\", \"CAR\"],\n", " shared_features_columns=[\"GROUP\", \"SURVEY\", \"SP\", \"PURPOSE\", \"FIRST\", \"TICKET\", \"WHO\", \"LUGGAGE\", \"AGE\",\n", " \"MALE\", \"INCOME\", \"GA\", \"ORIGIN\", \"DEST\"],\n", - " items_features_suffixes=[\"CO\", \"TT\", \"HE\", \"SEATS\"],\n", - " available_items_suffix=\"AV\", # [\"TRAIN_AV\", \"SM_AV\", \"CAR_AV\"] also works\n", + " items_features_patterns=[\"*_CO\", \"*_TT\", \"*_HE\", \"*_SEATS\"],\n", + " available_items_pattern=\"*_AV\", # [\"TRAIN_AV\", \"SM_AV\", \"CAR_AV\"] also works\n", " choices_column=\"CHOICE\",\n", " choice_format=\"item_index\",\n", ")" diff --git a/notebooks/introduction/2_data_handling.ipynb b/notebooks/introduction/2_data_handling.ipynb index 7da51581..ad0ab506 100644 --- a/notebooks/introduction/2_data_handling.ipynb +++ b/notebooks/introduction/2_data_handling.ipynb @@ -218,11 +218,10 @@ " shared_features_columns=[\"PURPOSE\", \"AGE\"],\n", "\n", " # Columns for items_features_by_choice\n", - " # They will be reconstructed as item_id + delimiter + feature_suffix\n", - " items_features_suffixes=[\"CO\", \"TT\"],\n", + " # They will be reconstructed as item_id replacing '*' in feature_pattern\n", + " items_features_patterns=[\"*_CO\", \"*_TT\"],\n", " # Same with availabilities\n", - " available_items_suffix=\"AV\",\n", - " delimiter=\"_\",\n", + " available_items_pattern=\"*_AV\",\n", ")" ] }, @@ -574,9 +573,8 @@ "\n", " # The new features are added here compared to example above\n", " shared_features_columns=[\"PURPOSE\", \"AGE\", \"CANTON_SURFACE\", \"CANTON_INHAB\"],\n", - " items_features_suffixes=[\"CO\", \"TT\"],\n", - " available_items_suffix=\"AV\",\n", - " delimiter=\"_\",\n", + " items_features_patterns=[\"*_CO\", \"*_TT\"],\n", + " 
available_items_pattern=\"*_AV\",\n", ")" ] }, From 83f9289a8102a1e5798a3e157c38f42362a9fba5 Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Thu, 29 Jan 2026 13:33:41 +0000 Subject: [PATCH 05/10] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- choice_learn/data/choice_dataset.py | 6 ++++-- choice_learn/datasets/base.py | 6 ++++-- 2 files changed, 8 insertions(+), 4 deletions(-) diff --git a/choice_learn/data/choice_dataset.py b/choice_learn/data/choice_dataset.py index 1300254c..31872e46 100644 --- a/choice_learn/data/choice_dataset.py +++ b/choice_learn/data/choice_dataset.py @@ -929,8 +929,9 @@ def from_single_wide_df( shared_features_by_choice_names = None if items_features_patterns is not None: - assert all(["*" in pattern for pattern in items_features_patterns]), \ + assert all(["*" in pattern for pattern in items_features_patterns]), ( "items_features_patterns should all contain '*' character." + ) items_features_by_choice = [] for item in items_id: columns = [feature.replace("*", item) for feature in items_features_patterns] @@ -958,8 +959,9 @@ def from_single_wide_df( logging.info("Each column will be matched to an item, given their order") available_items_by_choice = df[available_items_pattern].to_numpy() else: - assert "*" in available_items_pattern, \ + assert "*" in available_items_pattern, ( "available_items_pattern should contain '*' character." 
+ ) columns = [available_items_pattern.replace("*", item) for item in items_id] available_items_by_choice = df[columns].to_numpy() else: diff --git a/choice_learn/datasets/base.py b/choice_learn/datasets/base.py index dee10e25..fa30adc4 100644 --- a/choice_learn/datasets/base.py +++ b/choice_learn/datasets/base.py @@ -338,8 +338,10 @@ def load_swissmetro(add_items_one_hot=False, as_frame=False, return_desc=False, df=swiss_df, items_id=items, shared_features_columns=shared_features_by_choice_names, - items_features_patterns=["*_%s" % column for column in ( - items_features_by_choice_names + ["ASC_TRAIN", "ASC_SM", "ASC_CAR"])], + items_features_patterns=[ + "*_%s" % column + for column in (items_features_by_choice_names + ["ASC_TRAIN", "ASC_SM", "ASC_CAR"]) + ], available_items_pattern="*_%s" % availabilities_column, choices_column=choice_column, choice_format="items_index", From e6bf4b7c6a14f3817ee2f48d57fdef4a7615f9ce Mon Sep 17 00:00:00 2001 From: VincentAuriau Date: Tue, 3 Feb 2026 18:23:29 +0100 Subject: [PATCH 06/10] ADD: attempt at auto cleaning features name --- choice_learn/data/choice_dataset.py | 31 +++++++++++++++++++++++------ 1 file changed, 25 insertions(+), 6 deletions(-) diff --git a/choice_learn/data/choice_dataset.py b/choice_learn/data/choice_dataset.py index 31872e46..ea6e98b1 100644 --- a/choice_learn/data/choice_dataset.py +++ b/choice_learn/data/choice_dataset.py @@ -1,6 +1,7 @@ """Main classes to handle assortment data.""" import logging +import re import numpy as np import pandas as pd @@ -886,6 +887,7 @@ def from_single_wide_df( shared_features_columns=None, items_features_patterns=None, available_items_pattern=None, + patterns_ignore_chars="[^a-zA-Z0-9_]", choices_column="choice", choice_format="items_id", ): @@ -902,11 +904,16 @@ def from_single_wide_df( items_features_patterns : list of str, optional Patterns of the columns of the dataframe that are items_features_by_choice, given as "*suffix" or "prefix*" where "*" is replaced by 
items_id in df columns. + It is possible to specify characters to be ignored by including them between [^ and ]. default is None available_items_pattern: str, optional Pattern of the columns of the dataframe that are available_items_by_choice, given as "*suffix" or "prefix*" where "*" is replaced by items_id in df columns. default is None + patterns_ignore_chars: str or list, optional + Characters to be ignored in the patterns matching, given as a regex string + (e.g. "[^a-zA-Z0-9_]") or as a list of characters (e.g. [" ", "-", "/"]), + default is "[^a-zA-Z0-9_]" choice_column: str, optional Name of the column containing the choices, default is "choice" choice_format: str, optional @@ -929,9 +936,8 @@ def from_single_wide_df( shared_features_by_choice_names = None if items_features_patterns is not None: - assert all(["*" in pattern for pattern in items_features_patterns]), ( - "items_features_patterns should all contain '*' character." - ) + if not all(["*" in pattern for pattern in items_features_patterns]): + raise ValueError("items_features_patterns should all contain '*' character.") items_features_by_choice = [] for item in items_id: columns = [feature.replace("*", item) for feature in items_features_patterns] @@ -944,6 +950,20 @@ def from_single_wide_df( df[col] = 0 items_features_by_choice.append(df[columns].to_numpy()) items_features_by_choice = np.stack(items_features_by_choice, axis=1) + items_features_names = [ + features.replace("*", "") for features in items_features_patterns + ] + if isinstance(patterns_ignore_chars, list): + for char in patterns_ignore_chars: + items_features_names = [name.replace(char, "") for name in items_features_names] + elif isinstance(patterns_ignore_chars, str): + regex = re.compile(patterns_ignore_chars) + items_features_names = [regex.sub("", name) for name in items_features_names] + elif items_features_patterns is not None: + raise ValueError( + f"""patterns_ignore_chars should either be a list of characters, + a regex 
string or None, got {type(patterns_ignore_chars)}""" + ) else: items_features_by_choice = None items_features_names = None @@ -959,9 +979,8 @@ def from_single_wide_df( logging.info("Each column will be matched to an item, given their order") available_items_by_choice = df[available_items_pattern].to_numpy() else: - assert "*" in available_items_pattern, ( - "available_items_pattern should contain '*' character." - ) + if "*" not in available_items_pattern: + raise ValueError("available_items_pattern should contain '*' character.") columns = [available_items_pattern.replace("*", item) for item in items_id] available_items_by_choice = df[columns].to_numpy() else: From 64b53c81e088c7678c622d1456ed75d923667c13 Mon Sep 17 00:00:00 2001 From: VincentAuriau Date: Wed, 4 Feb 2026 15:04:53 +0100 Subject: [PATCH 07/10] ADD: excluded chars in feature names --- choice_learn/data/choice_dataset.py | 6 ++++-- choice_learn/datasets/base.py | 2 +- 2 files changed, 5 insertions(+), 3 deletions(-) diff --git a/choice_learn/data/choice_dataset.py b/choice_learn/data/choice_dataset.py index ea6e98b1..6f9d3609 100644 --- a/choice_learn/data/choice_dataset.py +++ b/choice_learn/data/choice_dataset.py @@ -887,7 +887,7 @@ def from_single_wide_df( shared_features_columns=None, items_features_patterns=None, available_items_pattern=None, - patterns_ignore_chars="[^a-zA-Z0-9_]", + patterns_ignore_chars="[^a-zA-Z0-9]", choices_column="choice", choice_format="items_id", ): @@ -959,6 +959,7 @@ def from_single_wide_df( elif isinstance(patterns_ignore_chars, str): regex = re.compile(patterns_ignore_chars) items_features_names = [regex.sub("", name) for name in items_features_names] + print(">>>", items_features_names) elif items_features_patterns is not None: raise ValueError( f"""patterns_ignore_chars should either be a list of characters, @@ -973,7 +974,7 @@ def from_single_wide_df( if not len(available_items_pattern) == len(items_id): raise ValueError( "You have given a list of columns for 
availabilities." - "We consider that it is one for each item however lenghts do not match" + "We consider that it is one for each item however lengths do not match" ) logging.info("You have given a list of columns for availabilities.") logging.info("Each column will be matched to an item, given their order") @@ -982,6 +983,7 @@ def from_single_wide_df( if "*" not in available_items_pattern: raise ValueError("available_items_pattern should contain '*' character.") columns = [available_items_pattern.replace("*", item) for item in items_id] + print(">>>", columns, available_items_pattern, items_id) available_items_by_choice = df[columns].to_numpy() else: available_items_by_choice = None diff --git a/choice_learn/datasets/base.py b/choice_learn/datasets/base.py index fa30adc4..1efe25dc 100644 --- a/choice_learn/datasets/base.py +++ b/choice_learn/datasets/base.py @@ -1060,6 +1060,7 @@ def load_hc( df=hc_df, shared_features_columns=["income"], items_features_patterns=["ich.*", "och.*", "occa.*", "icca.*"], + patterns_ignore_chars="[^a-zA-Z0-9_]", items_id=items_id, choices_column="depvar", choice_format="items_id", @@ -1205,7 +1206,6 @@ def load_londonpassenger( items_id=items, shared_features_columns=shared_features_by_choice_names, items_features_patterns=["*_%s" % s for s in items_features_by_choice_names], - delimiter="_", choices_column=choice_column, choice_format="items_index", ) From 46039095b09644359120fc0260bc503b0f6e87e1 Mon Sep 17 00:00:00 2001 From: VincentAuriau Date: Wed, 4 Feb 2026 15:05:10 +0100 Subject: [PATCH 08/10] update tests --- .../models/test_nested_logit.py | 17 +- tests/unit_tests/data/test_choice_dataset.py | 280 +++++++++--------- 2 files changed, 148 insertions(+), 149 deletions(-) diff --git a/tests/integration_tests/models/test_nested_logit.py b/tests/integration_tests/models/test_nested_logit.py index f6c80330..fd9a63f7 100644 --- a/tests/integration_tests/models/test_nested_logit.py +++ 
b/tests/integration_tests/models/test_nested_logit.py @@ -34,16 +34,15 @@ dataset = ChoiceDataset.from_single_wide_df( df=hc_df, shared_features_columns=["income"], - items_features_prefixes=[ - "ich", - "och", - "occa", - "icca", - "int_cooling", - "inc_cooling", - "inc_room", + items_features_patterns=[ + "ich.*", + "och.*", + "occa.*", + "icca.*", + "int_cooling.*", + "inc_cooling.*", + "inc_room.*", ], - delimiter=".", items_id=items_id, choices_column="depvar", choice_format="items_id", diff --git a/tests/unit_tests/data/test_choice_dataset.py b/tests/unit_tests/data/test_choice_dataset.py index a616afff..c09a79f1 100644 --- a/tests/unit_tests/data/test_choice_dataset.py +++ b/tests/unit_tests/data/test_choice_dataset.py @@ -667,8 +667,8 @@ def test_from_wide_df(): df=pd.DataFrame(wide_df), items_id=["it_1", "it_2"], shared_features_columns=["sh_1", "sh_2"], - items_features_suffixes=["1", "2", "3"], - available_items_suffix=["av_it_1", "av_it_2"], + items_features_patterns=["*_1", "*_2", "*_3"], + available_items_pattern="av_*", choices_column="choice", choice_format="items_id", ) @@ -685,14 +685,15 @@ def test_from_wide_df(): df=pd.DataFrame(wide_df), items_id=["it_1", "it_2"], shared_features_columns=None, - items_features_suffixes=["1", "2", "3"], - available_items_suffix=["av_it_1", "av_it_2"], + items_features_patterns=["*_1", "*_2", "*_3"], + available_items_pattern="av_*", choices_column="choice", choice_format="items_id", ) assert dataset.shared_features_by_choice is None assert dataset.shared_features_by_choice_names is None assert dataset.items_features_by_choice_names == (["1", "2", "3"],) + print(dataset.items_features_by_choice) assert ( dataset.items_features_by_choice == np.array( @@ -705,150 +706,149 @@ def test_from_wide_df(): ) ).all() - with pytest.raises(ValueError): + with pytest.raises(KeyError): ChoiceDataset.from_single_wide_df( df=pd.DataFrame(wide_df), items_id=["it_1", "it_2"], shared_features_columns=None, - 
items_features_suffixes=["1", "2", "3"], - available_items_suffix=["av_it_1", "av_it_2"], - available_items_prefix=["av_it_1", "av_it_2"], - choices_column="choice", - choice_format="items_id", - ) - - with pytest.raises(ValueError): - ChoiceDataset.from_single_wide_df( - df=pd.DataFrame(wide_df), - items_id=["it_1", "it_2"], - shared_features_columns=["sh_1", "sh_2"], - items_features_suffixes=["1", "2", "3"], - available_items_suffix=["av_it_1", "av_it_2", "av_it_3"], - choices_column="choice", - choice_format="items_id", - ) - - dataset = ChoiceDataset.from_single_wide_df( - df=pd.DataFrame(wide_df), - items_id=["it_1", "it_2"], - shared_features_columns=["sh_1", "sh_2"], - items_features_suffixes=["1", "2"], - available_items_prefix=["av_it_1", "av_it_2"], - choices_column="choice", - choice_format="items_id", - ) - assert ( - dataset.available_items_by_choice == np.array([[1.0, 1.0], [1.0, 0.0], [1.0, 1.0]]) - ).all() - assert (dataset.choices == np.array([0, 0, 1])).all() - with pytest.raises(ValueError): - ChoiceDataset.from_single_wide_df( - df=pd.DataFrame(wide_df), - items_id=["it_1", "it_2"], - shared_features_columns=["sh_1", "sh_2"], - items_features_suffixes=["1", "2"], - available_items_prefix=["av_it_1", "av_it_2", "av_it_3"], - choices_column="choice", - choice_format="items_id", - ) - dataset = ChoiceDataset.from_single_wide_df( - df=pd.DataFrame(wide_df), - items_id=["it_1", "it_2"], - shared_features_columns=["sh_1", "sh_2"], - items_features_suffixes=["1", "2"], - available_items_prefix="av", - choices_column="choice", - choice_format="items_id", - ) - assert ( - dataset.available_items_by_choice == np.array([[1.0, 1.0], [1.0, 0.0], [1.0, 1.0]]) - ).all() - assert (dataset.choices == np.array([0, 0, 1])).all() - - with pytest.raises(ValueError): - ChoiceDataset.from_single_wide_df( - df=pd.DataFrame(wide_df), - items_id=None, - shared_features_columns=["sh_1", "sh_2"], - choices_column="choice", - choice_format="items_id", - ) - with 
pytest.raises(ValueError): - wide_df_false = { - "sh_1": [1.1, 2.2, 3.3], - "sh_2": [11.1, 22.2, 33.3], - "it_1_1": [0.4, 0.5, 0.6], - "it_2_1": [0.7, 0.8, 0.9], - "it_1_2": [1.4, 1.5, 1.6], - "it_2_2": [1.7, 1.8, 1.9], - "it_1_3": [2.4, 2.5, 2.6], - "it_2_3": [2.7, 2.8, 2.9], - "av_it_1": [1, 1, 1], - "av_it_2": [1, 0, 1], - "choice": ["it_3", "it_3", "it_4"], - } - ChoiceDataset.from_single_wide_df( - df=pd.DataFrame(wide_df_false), - items_id=["it_1", "it_2"], - shared_features_columns=["sh_1", "sh_2"], - items_features_suffixes=["1", "2"], - available_items_prefix="av", + items_features_patterns=["*_1", "*_2", "*_3"], + available_items_pattern="*av_*", choices_column="choice", choice_format="items_id", ) - extra_wide_df = { - "sh_1": [1.1, 2.2, 3.3], - "sh_2": [11.1, 22.2, 33.3], - "it_1_1": [0.4, 0.5, 0.6], - "it_2_1": [0.7, 0.8, 0.9], - "it_1_2": [1.4, 1.5, 1.6], - "it_2_2": [1.7, 1.8, 1.9], - "it_1_3": [2.4, 2.5, 2.6], - "it_2_3": [2.7, 2.8, 2.9], - "1_it_1": [3.4, 3.5, 3.6], - "1_it_2": [3.7, 3.8, 3.9], - "2_it_1": [4.4, 4.5, 4.6], - "2_it_2": [4.7, 4.8, 4.9], - "3_it_1": [5.4, 5.5, 5.6], - "3_it_2": [5.7, 5.8, 5.9], - "av_it_1": [1, 1, 1], - "av_it_2": [1, 0, 1], - "choice": ["it_1", "it_1", "it_2"], - } - dataset = ChoiceDataset.from_single_wide_df( - df=pd.DataFrame(extra_wide_df), - items_id=["it_1", "it_2"], - shared_features_columns=None, - items_features_prefixes=["1", "2", "3"], - items_features_suffixes=["1", "2", "3"], - available_items_suffix=["av_it_1", "av_it_2"], - choices_column="choice", - choice_format="items_id", - ) - assert dataset.shared_features_by_choice is None - assert dataset.shared_features_by_choice_names is None - assert dataset.items_features_by_choice_names == (["1", "2", "3", "1", "2", "3"],) - assert ( - dataset.items_features_by_choice - == np.array( - [ - [ - [3.4, 4.4, 5.4, 0.4, 1.4, 2.4], - [3.7, 4.7, 5.7, 0.7, 1.7, 2.7], - ], - [ - [3.5, 4.5, 5.5, 0.5, 1.5, 2.5], - [3.8, 4.8, 5.8, 0.8, 1.8, 2.8], - ], - [ - [3.6, 4.6, 
5.6, 0.6, 1.6, 2.6], - [3.9, 4.9, 5.9, 0.9, 1.9, 2.9], - ], - ], - dtype=np.float64, - ) - ).all() + # with pytest.raises(ValueError): + # ChoiceDataset.from_single_wide_df( + # df=pd.DataFrame(wide_df), + # items_id=["it_1", "it_2"], + # shared_features_columns=["sh_1", "sh_2"], + # items_features_suffixes=["1", "2", "3"], + # available_items_suffix=["av_it_1", "av_it_2", "av_it_3"], + # choices_column="choice", + # choice_format="items_id", + # ) + + # dataset = ChoiceDataset.from_single_wide_df( + # df=pd.DataFrame(wide_df), + # items_id=["it_1", "it_2"], + # shared_features_columns=["sh_1", "sh_2"], + # items_features_suffixes=["1", "2"], + # available_items_prefix=["av_it_1", "av_it_2"], + # choices_column="choice", + # choice_format="items_id", + # ) + # assert ( + # dataset.available_items_by_choice == np.array([[1.0, 1.0], [1.0, 0.0], [1.0, 1.0]]) + # ).all() + # assert (dataset.choices == np.array([0, 0, 1])).all() + # with pytest.raises(ValueError): + # ChoiceDataset.from_single_wide_df( + # df=pd.DataFrame(wide_df), + # items_id=["it_1", "it_2"], + # shared_features_columns=["sh_1", "sh_2"], + # items_features_suffixes=["1", "2"], + # available_items_prefix=["av_it_1", "av_it_2", "av_it_3"], + # choices_column="choice", + # choice_format="items_id", + # ) + # dataset = ChoiceDataset.from_single_wide_df( + # df=pd.DataFrame(wide_df), + # items_id=["it_1", "it_2"], + # shared_features_columns=["sh_1", "sh_2"], + # items_features_suffixes=["1", "2"], + # available_items_prefix="av", + # choices_column="choice", + # choice_format="items_id", + # ) + # assert ( + # dataset.available_items_by_choice == np.array([[1.0, 1.0], [1.0, 0.0], [1.0, 1.0]]) + # ).all() + # assert (dataset.choices == np.array([0, 0, 1])).all() + + # with pytest.raises(ValueError): + # ChoiceDataset.from_single_wide_df( + # df=pd.DataFrame(wide_df), + # items_id=None, + # shared_features_columns=["sh_1", "sh_2"], + # choices_column="choice", + # choice_format="items_id", + # ) + # with 
pytest.raises(ValueError): + # wide_df_false = { + # "sh_1": [1.1, 2.2, 3.3], + # "sh_2": [11.1, 22.2, 33.3], + # "it_1_1": [0.4, 0.5, 0.6], + # "it_2_1": [0.7, 0.8, 0.9], + # "it_1_2": [1.4, 1.5, 1.6], + # "it_2_2": [1.7, 1.8, 1.9], + # "it_1_3": [2.4, 2.5, 2.6], + # "it_2_3": [2.7, 2.8, 2.9], + # "av_it_1": [1, 1, 1], + # "av_it_2": [1, 0, 1], + # "choice": ["it_3", "it_3", "it_4"], + # } + # ChoiceDataset.from_single_wide_df( + # df=pd.DataFrame(wide_df_false), + # items_id=["it_1", "it_2"], + # shared_features_columns=["sh_1", "sh_2"], + # items_features_suffixes=["1", "2"], + # available_items_prefix="av", + # choices_column="choice", + # choice_format="items_id", + # ) + + # extra_wide_df = { + # "sh_1": [1.1, 2.2, 3.3], + # "sh_2": [11.1, 22.2, 33.3], + # "it_1_1": [0.4, 0.5, 0.6], + # "it_2_1": [0.7, 0.8, 0.9], + # "it_1_2": [1.4, 1.5, 1.6], + # "it_2_2": [1.7, 1.8, 1.9], + # "it_1_3": [2.4, 2.5, 2.6], + # "it_2_3": [2.7, 2.8, 2.9], + # "1_it_1": [3.4, 3.5, 3.6], + # "1_it_2": [3.7, 3.8, 3.9], + # "2_it_1": [4.4, 4.5, 4.6], + # "2_it_2": [4.7, 4.8, 4.9], + # "3_it_1": [5.4, 5.5, 5.6], + # "3_it_2": [5.7, 5.8, 5.9], + # "av_it_1": [1, 1, 1], + # "av_it_2": [1, 0, 1], + # "choice": ["it_1", "it_1", "it_2"], + # } + # dataset = ChoiceDataset.from_single_wide_df( + # df=pd.DataFrame(extra_wide_df), + # items_id=["it_1", "it_2"], + # shared_features_columns=None, + # items_features_prefixes=["1", "2", "3"], + # items_features_suffixes=["1", "2", "3"], + # available_items_suffix=["av_it_1", "av_it_2"], + # choices_column="choice", + # choice_format="items_id", + # ) + # assert dataset.shared_features_by_choice is None + # assert dataset.shared_features_by_choice_names is None + # assert dataset.items_features_by_choice_names == (["1", "2", "3", "1", "2", "3"],) + # assert ( + # dataset.items_features_by_choice + # == np.array( + # [ + # [ + # [3.4, 4.4, 5.4, 0.4, 1.4, 2.4], + # [3.7, 4.7, 5.7, 0.7, 1.7, 2.7], + # ], + # [ + # [3.5, 4.5, 5.5, 0.5, 1.5, 2.5], + # 
[3.8, 4.8, 5.8, 0.8, 1.8, 2.8], + # ], + # [ + # [3.6, 4.6, 5.6, 0.6, 1.6, 2.6], + # [3.9, 4.9, 5.9, 0.9, 1.9, 2.9], + # ], + # ], + # dtype=np.float64, + # ) + # ).all() def test_summary(): From 2483024569b71f9799f771b266c8715af020bd4f Mon Sep 17 00:00:00 2001 From: VincentAuriau Date: Wed, 4 Feb 2026 17:36:47 +0100 Subject: [PATCH 09/10] fix london --- choice_learn/datasets/base.py | 1 + 1 file changed, 1 insertion(+) diff --git a/choice_learn/datasets/base.py b/choice_learn/datasets/base.py index 1efe25dc..f2de778f 100644 --- a/choice_learn/datasets/base.py +++ b/choice_learn/datasets/base.py @@ -1206,6 +1206,7 @@ def load_londonpassenger( items_id=items, shared_features_columns=shared_features_by_choice_names, items_features_patterns=["*_%s" % s for s in items_features_by_choice_names], + patterns_ignore_chars="[^a-zA-Z0-9_]", choices_column=choice_column, choice_format="items_index", ) From 03115330d4d21ec2f3364be7e7a87cfa13d1dec2 Mon Sep 17 00:00:00 2001 From: VincentAuriau Date: Thu, 5 Feb 2026 19:29:20 +0100 Subject: [PATCH 10/10] test: new config for dataset instantiation --- choice_learn/datasets/base.py | 120 ++++++++++-------- .../models/test_nested_logit.py | 3 +- 2 files changed, 66 insertions(+), 57 deletions(-) diff --git a/choice_learn/datasets/base.py b/choice_learn/datasets/base.py index f2de778f..e9030626 100644 --- a/choice_learn/datasets/base.py +++ b/choice_learn/datasets/base.py @@ -212,6 +212,9 @@ def load_swissmetro(add_items_one_hot=False, as_frame=False, return_desc=False, "DEST", ] items_features_by_choice_names = ["CO", "TT", "HE", "SEATS"] + for feature in items_features_by_choice_names + ["AV"]: + for item in items: + swiss_df = swiss_df.rename(columns={f"{item}_{feature}": f"{item}-{feature}"}) choice_column = "CHOICE" availabilities_column = "AV" @@ -220,9 +223,9 @@ def load_swissmetro(add_items_one_hot=False, as_frame=False, return_desc=False, for item in items: for item2 in items: if item == item2: - swiss_df[f"{item}_oh_{item}"] 
= 1 + swiss_df[f"{item}-oh_{item}"] = 1 else: - swiss_df[f"{item2}_oh_{item}"] = 0 + swiss_df[f"{item2}-oh_{item}"] = 0 if return_desc: return description @@ -244,15 +247,15 @@ def load_swissmetro(add_items_one_hot=False, as_frame=False, return_desc=False, } for item_index, item_id in enumerate(["TRAIN", "SM", "CAR"]): - if row[f"{item_id}_AV"] > 0: + if row[f"{item_id}-AV"] > 0: if item_index == row.CHOICE: df_dict["CHOICE"].append(1) else: df_dict["CHOICE"].append(0) df_dict["item_id"].append(item_id) - df_dict["TT"].append(row[f"{item_id}_TT"]) - df_dict["CO"].append(row[f"{item_id}_CO"]) + df_dict["TT"].append(row[f"{item_id}-TT"]) + df_dict["CO"].append(row[f"{item_id}-CO"]) df_dict["PURPOSE"].append(row["PURPOSE"]) df_dict["AGE"].append(row["AGE"]) @@ -266,17 +269,17 @@ def load_swissmetro(add_items_one_hot=False, as_frame=False, return_desc=False, if preprocessing == "tastenet": swiss_df = swiss_df.loc[swiss_df.AGE != 6] - swiss_df["TRAIN_ASC_TRAIN"] = 1.0 - swiss_df["SM_ASC_TRAIN"] = 0.0 - swiss_df["CAR_ASC_TRAIN"] = 0.0 + swiss_df["TRAIN-ASC_TRAIN"] = 1.0 + swiss_df["SM-ASC_TRAIN"] = 0.0 + swiss_df["CAR-ASC_TRAIN"] = 0.0 - swiss_df["TRAIN_ASC_SM"] = 0.0 - swiss_df["SM_ASC_SM"] = 1.0 - swiss_df["CAR_ASC_SM"] = 0.0 + swiss_df["TRAIN-ASC_SM"] = 0.0 + swiss_df["SM-ASC_SM"] = 1.0 + swiss_df["CAR-ASC_SM"] = 0.0 - swiss_df["TRAIN_ASC_CAR"] = 0.0 - swiss_df["SM_ASC_CAR"] = 0.0 - swiss_df["CAR_ASC_CAR"] = 1.0 + swiss_df["TRAIN-ASC_CAR"] = 0.0 + swiss_df["SM-ASC_CAR"] = 0.0 + swiss_df["CAR-ASC_CAR"] = 1.0 swiss_df["FEMALE"] = 1 - swiss_df["MALE"] shared_features_by_choice_names = ["MALE", "FEMALE"] @@ -319,18 +322,18 @@ def load_swissmetro(add_items_one_hot=False, as_frame=False, return_desc=False, swiss_df = pd.concat([swiss_df, luggage_dummy], axis=1) shared_features_by_choice_names += luggage_dummy.columns.to_list() - swiss_df["SM_CO"] = swiss_df["SM_CO"] * (swiss_df["GA"] == 0) - swiss_df["TRAIN_CO"] = swiss_df["TRAIN_CO"] * (swiss_df["GA"] == 0) + 
swiss_df["SM-CO"] = swiss_df["SM-CO"] * (swiss_df["GA"] == 0) + swiss_df["TRAIN-CO"] = swiss_df["TRAIN-CO"] * (swiss_df["GA"] == 0) for col in [ - "TRAIN_TT", - "TRAIN_HE", - "TRAIN_CO", - "SM_TT", - "SM_HE", - "SM_CO", - "CAR_TT", - "CAR_CO", + "TRAIN-TT", + "TRAIN-HE", + "TRAIN-CO", + "SM-TT", + "SM-HE", + "SM-CO", + "CAR-TT", + "CAR-CO", ]: swiss_df[col] = swiss_df[col] / 100 @@ -340,9 +343,10 @@ def load_swissmetro(add_items_one_hot=False, as_frame=False, return_desc=False, shared_features_columns=shared_features_by_choice_names, items_features_patterns=[ "*_%s" % column - for column in (items_features_by_choice_names + ["ASC_TRAIN", "ASC_SM", "ASC_CAR"]) + for column in (items_features_by_choice_names + ["ASC-TRAIN", "ASC-SM", "ASC-CAR"]) ], - available_items_pattern="*_%s" % availabilities_column, + available_items_pattern="*-%s" % availabilities_column, + patterns_ignore_chars="[^a-zA-Z0-9_]", choices_column=choice_column, choice_format="items_index", ) @@ -354,8 +358,8 @@ def load_swissmetro(add_items_one_hot=False, as_frame=False, return_desc=False, swiss_df = swiss_df.loc[swiss_df.PURPOSE.isin([1, 3])] # Normalizing values - swiss_df[["TRAIN_TT", "SM_TT", "CAR_TT"]] = swiss_df[["TRAIN_TT", "SM_TT", "CAR_TT"]] / 60.0 - swiss_df[["TRAIN_HE", "SM_HE"]] = swiss_df[["TRAIN_HE", "SM_HE"]] / 60.0 + swiss_df[["TRAIN-TT", "SM-TT", "CAR-TT"]] = swiss_df[["TRAIN-TT", "SM-TT", "CAR-TT"]] / 60.0 + swiss_df[["TRAIN-HE", "SM-HE"]] = swiss_df[["TRAIN-HE", "SM-HE"]] / 60.0 swiss_df["train_free_ticket"] = swiss_df.apply( lambda row: ((row["GA"] == 1 or row["WHO"] == 2) > 0).astype(int), axis=1 @@ -366,12 +370,12 @@ def load_swissmetro(add_items_one_hot=False, as_frame=False, return_desc=False, swiss_df["car_free_ticket"] = 0 swiss_df["train_travel_cost"] = swiss_df.apply( - lambda row: (row["TRAIN_CO"] * (1 - row["train_free_ticket"])) / 100, axis=1 + lambda row: (row["TRAIN-CO"] * (1 - row["train_free_ticket"])) / 100, axis=1 ) swiss_df["sm_travel_cost"] = swiss_df.apply( 
- lambda row: (row["SM_CO"] * (1 - row["sm_free_ticket"])) / 100, axis=1 + lambda row: (row["SM-CO"] * (1 - row["sm_free_ticket"])) / 100, axis=1 ) - swiss_df["car_travel_cost"] = swiss_df.apply(lambda row: row["CAR_CO"] / 100, axis=1) + swiss_df["car_travel_cost"] = swiss_df.apply(lambda row: row["CAR-CO"] / 100, axis=1) swiss_df["single_luggage_piece"] = swiss_df.apply( lambda row: (row["LUGGAGE"] == 1).astype(int), axis=1 @@ -385,9 +389,9 @@ def load_swissmetro(add_items_one_hot=False, as_frame=False, return_desc=False, shared_features_by_choice = swiss_df[ ["train_survey", "regular_class", "single_luggage_piece", "multiple_luggage_piece"] ].to_numpy() - train_features = swiss_df[["train_travel_cost", "TRAIN_TT", "TRAIN_HE"]].to_numpy() - sm_features = swiss_df[["sm_travel_cost", "SM_TT", "SM_HE", "SM_SEATS"]].to_numpy() - car_features = swiss_df[["car_travel_cost", "CAR_TT"]].to_numpy() + train_features = swiss_df[["train_travel_cost", "TRAIN-TT", "TRAIN-HE"]].to_numpy() + sm_features = swiss_df[["sm_travel_cost", "SM-TT", "SM-HE", "SM-SEATS"]].to_numpy() + car_features = swiss_df[["car_travel_cost", "CAR-TT"]].to_numpy() # We need to have the same number of features for each item, we create dummy ones: car_features = np.concatenate([car_features, np.zeros((len(car_features), 2))], axis=1) @@ -396,7 +400,7 @@ def load_swissmetro(add_items_one_hot=False, as_frame=False, return_desc=False, ) items_features_by_choice = np.stack([train_features, sm_features, car_features], axis=1) - available_items_by_choice = swiss_df[["TRAIN_AV", "SM_AV", "CAR_AV"]].to_numpy() + available_items_by_choice = swiss_df[["TRAIN-AV", "SM-AV", "CAR-AV"]].to_numpy() # Re-Indexing choices from 1 to 3 to 0 to 2 choices = swiss_df.CHOICE.to_numpy() @@ -418,32 +422,31 @@ def load_swissmetro(add_items_one_hot=False, as_frame=False, return_desc=False, swiss_df = swiss_df.loc[swiss_df.PURPOSE.isin([1, 3])] # Normalizing values by 100 - swiss_df[["TRAIN_TT", "SM_TT", "CAR_TT"]] = ( - 
swiss_df[["TRAIN_TT", "SM_TT", "CAR_TT"]] / 100.0 + swiss_df[["TRAIN-TT", "SM-TT", "CAR-TT"]] = ( + swiss_df[["TRAIN-TT", "SM-TT", "CAR-TT"]] / 100.0 ) - swiss_df["train_free_ticket"] = swiss_df.apply( + swiss_df["train-free_ticket"] = swiss_df.apply( lambda row: (row["GA"] == 1).astype(int), axis=1 ) - swiss_df["sm_free_ticket"] = swiss_df.apply( + swiss_df["sm-free_ticket"] = swiss_df.apply( lambda row: (row["GA"] == 1).astype(int), axis=1 ) - swiss_df["train_travel_cost"] = swiss_df.apply( - lambda row: (row["TRAIN_CO"] * (1 - row["train_free_ticket"])) / 100, axis=1 + swiss_df["train-travel_cost"] = swiss_df.apply( + lambda row: (row["TRAIN-CO"] * (1 - row["train-free_ticket"])) / 100, axis=1 ) - swiss_df["sm_travel_cost"] = swiss_df.apply( - lambda row: (row["SM_CO"] * (1 - row["sm_free_ticket"])) / 100, axis=1 + swiss_df["sm-travel_cost"] = swiss_df.apply( + lambda row: (row["SM-CO"] * (1 - row["sm-free_ticket"])) / 100, axis=1 ) - swiss_df["car_travel_cost"] = swiss_df.apply(lambda row: row["CAR_CO"] / 100, axis=1) - - train_features = swiss_df[["train_travel_cost", "TRAIN_TT"]].to_numpy() - sm_features = swiss_df[["sm_travel_cost", "SM_TT"]].to_numpy() - car_features = swiss_df[["car_travel_cost", "CAR_TT"]].to_numpy() + swiss_df["car-travel_cost"] = swiss_df.apply(lambda row: row["CAR-CO"] / 100, axis=1) + train_features = swiss_df[["train-travel_cost", "TRAIN-TT"]].to_numpy() + sm_features = swiss_df[["sm-travel_cost", "SM-TT"]].to_numpy() + car_features = swiss_df[["car-travel_cost", "CAR-TT"]].to_numpy() items_features_by_choice = np.stack([train_features, sm_features, car_features], axis=1) - available_items_by_choice = swiss_df[["TRAIN_AV", "SM_AV", "CAR_AV"]].to_numpy() + available_items_by_choice = swiss_df[["TRAIN-AV", "SM-AV", "CAR-AV"]].to_numpy() # Re-Indexing choices from 1 to 3 to 0 to 2 choices = swiss_df.CHOICE.to_numpy() @@ -459,12 +462,12 @@ def load_swissmetro(add_items_one_hot=False, as_frame=False, return_desc=False, swiss_df["One"] = 
1.0 swiss_df["Zero"] = 0.0 - available_items_by_choice = swiss_df[["TRAIN_AV", "SM_AV", "CAR_AV"]].to_numpy() + available_items_by_choice = swiss_df[["TRAIN-AV", "SM-AV", "CAR-AV"]].to_numpy() items_features_by_choice = np.stack( [ - swiss_df[["One", "Zero", "Zero", "TRAIN_TT", "TRAIN_CO", "TRAIN_HE"]].to_numpy(), - swiss_df[["Zero", "One", "Zero", "SM_TT", "SM_CO", "SM_HE"]].to_numpy(), - swiss_df[["Zero", "Zero", "One", "CAR_TT", "CAR_CO", "CAR_HE"]].to_numpy(), + swiss_df[["One", "Zero", "Zero", "TRAIN-TT", "TRAIN-CO", "TRAIN-HE"]].to_numpy(), + swiss_df[["Zero", "One", "Zero", "SM-TT", "SM-CO", "SM-HE"]].to_numpy(), + swiss_df[["Zero", "Zero", "One", "CAR-TT", "CAR-CO", "CAR-HE"]].to_numpy(), ], axis=1, ) @@ -534,8 +537,9 @@ def load_swissmetro(add_items_one_hot=False, as_frame=False, return_desc=False, df=swiss_df, items_id=items, shared_features_columns=shared_features_by_choice_names, - items_features_patterns=["*_%s" % s for s in items_features_by_choice_names], - available_items_pattern="*_%s" % availabilities_column, + items_features_patterns=["*-%s" % s for s in items_features_by_choice_names], + available_items_pattern="*-%s" % availabilities_column, + patterns_ignore_chars="[^a-zA-Z0-9_]", choices_column=choice_column, choice_format="items_index", ) @@ -930,6 +934,7 @@ def load_train( items_id=["1", "2"], shared_features_columns=["id"], items_features_patterns=["price*", "time*", "change*", "comfort*"], + patterns_ignore_chars="[^a-zA-Z0-9_]", choices_column="choice", choice_format="items_id", ) @@ -1201,11 +1206,14 @@ def load_londonpassenger( # Shift the index of the travel mode to start at 0 london_df["travel_mode"] = london_df["travel_mode"] - 1 + for feat in items_features_by_choice_names: + for item in items: + london_df = london_df.rename(columns={f"{item}_{feat}": f"{item}-{feat}"}) return ChoiceDataset.from_single_wide_df( df=london_df, items_id=items, shared_features_columns=shared_features_by_choice_names, - items_features_patterns=["*_%s" % 
s for s in items_features_by_choice_names], + items_features_patterns=["*-%s" % s for s in items_features_by_choice_names], patterns_ignore_chars="[^a-zA-Z0-9_]", choices_column=choice_column, choice_format="items_index", diff --git a/tests/integration_tests/models/test_nested_logit.py b/tests/integration_tests/models/test_nested_logit.py index fd9a63f7..2ddd316d 100644 --- a/tests/integration_tests/models/test_nested_logit.py +++ b/tests/integration_tests/models/test_nested_logit.py @@ -43,13 +43,14 @@ "inc_cooling.*", "inc_room.*", ], + patterns_ignore_chars="[^a-zA-Z0-9_]", items_id=items_id, choices_column="depvar", choice_format="items_id", ) -def test_fit_hc_formul_1(): +def test_fit_hc_formula_1(): """Tests specific config of NestedLogit on HC dataset.""" tf.config.run_functions_eagerly(True) global dataset