
@article{electronics15051123,
  author         = {Mjahad, Azeddine and Rosado-Mu{\~n}oz, Alfredo},
  title          = {Automated Vegetable Classification Using Hybrid {CNN} and {Vision Transformer} Models for Food Quality Assessment},
  journal        = {Electronics},
  year           = {2026},
  volume         = {15},
  number         = {5},
  article-number = {1123},
  doi            = {10.3390/electronics15051123},
  issn           = {2079-9292},
  url            = {https://www.mdpi.com/2079-9292/15/5/1123},
  abstract       = {The food industry increasingly relies on automated vision systems to ensure product quality, consistency, and safety. However, the visual classification of vegetables remains challenging due to high intra-class variability, illumination differences, and subtle morphological similarities between categories. This study evaluates the effectiveness of combining CNNs with four advanced Vision Transformer (ViT) architectures: DeiT (Data-efficient Image Transformer), CoaT (Co-Scale Conv-Attentional Transformer), CvT (Convolutional Vision Transformer), CrossViT (Cross-Attention Vision Transformer) for the automatic classification of 15 vegetable types. All models were implemented within a unified CNN--ViT hybrid framework to enhance both local feature extraction and global contextual reasoning. We processed all images under identical conditions to ensure a fair comparison and reproducibility. Results demonstrate that the hybrid architectures significantly outperform the standalone CNN baseline, with CvT achieving an approximate global accuracy in the range of 96.6--98.88\% and consistently strong performance across visually complex classes such as cabbage, brinjal, and pumpkin. These findings confirm that hybrid CNN--ViT models are highly effective for visual food analysis, offering a robust and scalable solution for quality control, automated inspection, and classification of agricultural products. The methodology presented here may also be extended to other food items, including gels and processed products, highlighting its versatility and industrial relevance.},
}



