
% Journal article in Scientific Data (2025) describing a Spanish health
% insurance dataset. Accented letters in the author names are written as
% LaTeX escapes so the entry also sorts and labels correctly under classic
% 8-bit BibTeX; biblatex/Biber reads the same form without changes.
% NOTE(review): no volume or article number is recorded in this entry; add
% them once confirmed against the publisher record.
@article{LledoEspinosaPerez_SD_2025,
  author   = {Lled{\'o}, Josep and Espinosa, Priscila and P{\'e}rez, Virgilio},
  title    = {A dataset for health insurance analysis: Integrating individual and area-based contextual variables},
  journal  = {Scientific Data},
  year     = {2025},
  issn     = {2052-4463},
  doi      = {10.1038/s41597-025-06372-z},
  url      = {https://doi.org/10.1038/s41597-025-06372-z},
  abstract = {Access to real data is a challenge for research and professional analysis in the insurance sector, especially since such access is often restricted due to confidentiality and competitive issues, particularly in health insurance. This paper introduces a new dataset from a Spanish health insurance portfolio, covering the years 2017 to 2019 with over 70 thousand unique insured and more than 225 thousand rows of data. The dataset contains 42 variables, of which 27 are directly sourced from the insurer and the rest are derived to include area-based contextual information obtained from publicly available sources. The data are anonymized to ensure privacy while maintaining the integrity required for robust professional analysis. Researchers can use the dataset to explore health insurance dynamics, from product design to contextual effects and risk management. Moreover, it supports academic applications for students and educators where they can use real-world data for exercises in data cleaning, statistical analysis and machine learning models.},
}



