<?xml version="1.0" encoding="UTF-8"?>
<!DOCTYPE article PUBLIC "-//NLM//DTD JATS (Z39.96) Journal Publishing DTD v1.4 20241031//EN" "JATS-journalpublishing1-4.dtd">
<article xmlns:mml="http://www.w3.org/1998/Math/MathML" xmlns:xlink="http://www.w3.org/1999/xlink" article-type="research-article" dtd-version="1.4" xml:lang="en">
  <front>
    <journal-meta>
      <journal-id journal-id-type="publisher-id">eng</journal-id>
      <journal-title-group>
        <journal-title>Engineering</journal-title>
      </journal-title-group>
      <issn pub-type="epub">1947-394X</issn>
      <issn pub-type="ppub">1947-3931</issn>
      <publisher>
        <publisher-name>Scientific Research Publishing</publisher-name>
      </publisher>
    </journal-meta>
    <article-meta>
      <article-id pub-id-type="doi">10.4236/eng.2026.184010</article-id>
      <article-id pub-id-type="publisher-id">eng-150892</article-id>
      <article-categories>
        <subj-group>
          <subject>Article</subject>
        </subj-group>
        <subj-group>
          <subject>Engineering</subject>
        </subj-group>
      </article-categories>
      <title-group>
        <article-title>Real-Time Recognition of Three Facial Emotions (“Surprise, Neutral, Happy”) Based on CNN with Augmented Data</article-title>
      </title-group>
      <contrib-group>
        <contrib contrib-type="author" corresp="yes">
          <name name-style="western">
            <surname>N’Drin</surname>
            <given-names>Hugues Auguste</given-names>
          </name>
          <xref ref-type="aff" rid="aff1">1</xref>
          <xref ref-type="aff" rid="aff2">2</xref>
        </contrib>
        <contrib contrib-type="author">
          <name name-style="western">
            <surname>Konan</surname>
            <given-names>Hyacinthe Kouassi</given-names>
          </name>
          <xref ref-type="aff" rid="aff1">1</xref>
        </contrib>
        <contrib contrib-type="author">
          <name name-style="western">
            <surname>Soro</surname>
            <given-names>Etienne Téna</given-names>
          </name>
          <xref ref-type="aff" rid="aff1">1</xref>
        </contrib>
        <contrib contrib-type="author">
          <name name-style="western">
            <surname>Asseu</surname>
            <given-names>Olivier</given-names>
          </name>
          <xref ref-type="aff" rid="aff1">1</xref>
          <xref ref-type="aff" rid="aff2">2</xref>
        </contrib>
      </contrib-group>
      <aff id="aff1"><label>1</label> Laboratoire des Sciences, des Technologies de l’Information et de la Communication en Abrégé (LASTIC), Ecole Supérieure Africaine des Technologies de l’Information et de la Communication (ESATIC), Abidjan, Côte d’Ivoire </aff>
      <aff id="aff2"><label>2</label> Institut National Polytechnique Félix Houphouët-Boigny (INPHB), École Doctorale Polytechnique (EDP)-Sciences et Techniques de l’Ingénieur (STI), Yamoussoukro, Côte d’Ivoire </aff>
      <author-notes>
        <fn fn-type="conflict" id="fn-conflict">
          <p>The authors declare no conflicts of interest regarding the publication of this paper.</p>
        </fn>
      </author-notes>
      <pub-date pub-type="epub">
        <day>21</day>
        <month>04</month>
        <year>2026</year>
      </pub-date>
      <pub-date pub-type="collection">
        <month>04</month>
        <year>2026</year>
      </pub-date>
      <volume>18</volume>
      <issue>04</issue>
      <fpage>145</fpage>
      <lpage>154</lpage>
      <history>
        <date date-type="received">
          <day>25</day>
          <month>02</month>
          <year>2026</year>
        </date>
        <date date-type="accepted">
          <day>20</day>
          <month>04</month>
          <year>2026</year>
        </date>
        <date date-type="published">
          <day>23</day>
          <month>04</month>
          <year>2026</year>
        </date>
      </history>
      <permissions>
        <copyright-statement>© 2026 by the authors and Scientific Research Publishing Inc.</copyright-statement>
        <copyright-year>2026</copyright-year>
        <license license-type="open-access">
          <license-p>This article is an open access article distributed under the terms and conditions of the Creative Commons Attribution (CC BY) license (<ext-link ext-link-type="uri" xlink:href="https://creativecommons.org/licenses/by/4.0/">https://creativecommons.org/licenses/by/4.0/</ext-link>).</license-p>
        </license>
      </permissions>
      <self-uri content-type="doi" xlink:href="https://doi.org/10.4236/eng.2026.184010">https://doi.org/10.4236/eng.2026.184010</self-uri>
      <abstract>
        <p>Emotion recognition from facial expressions has become essential for applications such as human-computer collaboration, robot communication, and interactive interfaces [<xref ref-type="bibr" rid="B1">1</xref>]. This work proposes a real-time recognition system (<italic>i.e.</italic>, the system produces a prediction with a latency low enough for smooth interaction, typically less than 100 ms per image or greater than 10 frames per second) capable of classifying three emotions (surprise, neutrality, and joy) from facial images. The model is based on a convolutional neural network (CNN) optimized by data augmentation techniques applied to the FER2013 dataset [<xref ref-type="bibr" rid="B2">2</xref>] (data augmentation was applied only to the training subset, after the dataset was split rather than before). The CNN has three convolutional layers, four fully connected layers, and uses ReLU and Softmax functions. The proposed approach achieves a validation accuracy of 89%, maintains high and balanced recognition rates for each class, and is capable of processing slightly distorted faces (<italic>i.e.</italic>, faces with small geometric or photometric variations, such as rotations, translations, scaling changes, or partial expressions) [<xref ref-type="bibr" rid="B3">3</xref>]. These results demonstrate the feasibility of fast, robust emotion recognition applicable to real-time interactive scenarios.</p>
      </abstract>
      <kwd-group kwd-group-type="author-generated" xml:lang="en">
        <kwd>Convolutional Neural Network</kwd>
        <kwd>Data Augmentation</kwd>
        <kwd>Validation Accuracy</kwd>
        <kwd>Emotion Detection</kwd>
      </kwd-group>
    </article-meta>
  </front>
  <body>
    <sec id="sec1">
      <title>1. Introduction</title>
      <p>Facial expressions convey a person’s emotions through facial muscle movements and are a reliable indicator of mental state. Facial expression analysis has numerous applications, including lie detection, social robotics, and data-driven animation [<xref ref-type="bibr" rid="B4">4</xref>]. For an intelligent agent or robot to interact effectively with humans, accurate emotion recognition is crucial [<xref ref-type="bibr" rid="B5">5</xref>]. Research on facial recognition has shown considerable progress, but challenges remain, particularly in maintaining a balanced recognition rate (defined as consistent performance across different classes, typically measured by averaging recalls per class to prevent a dominant class from biasing results) between different emotions [<xref ref-type="bibr" rid="B6">6</xref>]. This study focuses on three key emotions (surprise, neutral, happy) to optimize real-time recognition, simplify the classification problem, and reduce errors related to less represented classes [<xref ref-type="bibr" rid="B7">7</xref>][<xref ref-type="bibr" rid="B8">8</xref>].</p>
      <p>The use of CNNs, combined with data augmentation techniques, forms the core of this approach [<xref ref-type="bibr" rid="B3">3</xref>]. The following sections describe the related work, methodology, data collection and preprocessing, augmentation, system implementation, results, and finally, conclusions and future prospects.</p>
      <p><bold>Previous work</bold></p>
      <p>Facial expression recognition has been the subject of research for several years. CNNs have become the dominant approach due to their ability to automatically extract discriminating features. FER2013 was used to classify seven emotions, achieving acceptable accuracy. However, some classes, such as “disgust” and “fear”, exhibited very low recognition rates (45% and 41%, respectively) [<xref ref-type="bibr" rid="B8">8</xref>]. Recent models combining deep CNNs and residual blocks have improved overall accuracy (85.24%) on CK+ and JAFFE, but remain limited by the small number of datasets and the inability to handle distorted faces [<xref ref-type="bibr" rid="B4">4</xref>]. Data augmentation has proven crucial for improving model robustness, as demonstrated by recent work on FER2013 and RAF-DB [<xref ref-type="bibr" rid="B5">5</xref>]. This work shows that combining an efficient CNN with targeted data augmentation allows for robust performance while maintaining a balanced recognition rate for all classes [<xref ref-type="bibr" rid="B3">3</xref>]. For balanced performance, it is important to supplement overall accuracy with class-specific metrics. Specifically, for each of the three emotions, the following should be reported: 1) Precision (the proportion of correct predictions among those assigned to a given class), 2) Recall (the proportion of correctly identified examples among all actual examples of that class), 3) The F1 score (the harmonic mean of precision and recall, offering a compromise between the two).</p>
      <p>These metrics should be calculated from the confusion matrix, using the standard formulas:</p>
      <p>• Precision = TP/(TP + FP)</p>
      <p>• Recall = TP/(TP + FN)</p>
      <p>• F1 = 2 × (Precision × Recall)/(Precision + Recall)</p>
    </sec>
    <sec id="sec2">
      <title>2. Methodology</title>
      <p>The developed CNN model comprises (<xref ref-type="fig" rid="fig1">Figure 1</xref> and <xref ref-type="table" rid="tbl1">Table 1</xref>):</p>
      <p>3 convolutional layers (32, 64, 128 filters, 3 × 3 kernels, ReLU activation), 4 fully connected layers (750, 850, 850, 750 nodes, ReLU activation), 0.5 dropout after each dense layer, and a 3-node output layer with a Softmax function.</p>
      <p>The input image is 48 × 48-pixel grayscale. Normalization and face preprocessing ensure homogeneous input. Optimization is performed via SGD, with a learning rate of 0.01 and a cross-entropy loss function. Callbacks such as EarlyStopping, ReduceLROnPlateau, and ModelCheckpoint were used.</p>
      <fig id="fig1">
        <label>Figure 1</label>
        <graphic xlink:href="https://html.scirp.org/file/8104923-rId15.jpeg?20260423031309" />
      </fig>
      <p><bold>Figure 1</bold><bold>.</bold> The convolutional neural network.</p>
      <p><bold>Table 1</bold><bold>.</bold> System architecture.</p>
      <table-wrap id="tbl1">
        <label>Table 1</label>
        <table>
          <tbody>
            <tr>
              <td>Model component</td>
              <td>Details</td>
            </tr>
            <tr>
              <td>First convolution layer</td>
              <td>32 filters, 3 × 3 size, ReLU, input size 48 × 48</td>
            </tr>
            <tr>
              <td>First layer of max pooling:</td>
              <td>size 2 × 2</td>
            </tr>
            <tr>
              <td>Second convolution layer:</td>
              <td>64 filters, 3 × 3 size, ReLU</td>
            </tr>
            <tr>
              <td>Second layer of max pooling:</td>
              <td>size 2 × 2</td>
            </tr>
            <tr>
              <td>Third convolution layer:</td>
              <td>128 filters, 3 × 3 size, ReLU</td>
            </tr>
            <tr>
              <td>Third layer of max pooling:</td>
              <td>size 2 × 2</td>
            </tr>
            <tr>
              <td>First fully connected layer:</td>
              <td>750 nodes, ReLU</td>
            </tr>
            <tr>
              <td>Dropout layer:</td>
              <td>random exclusion of 50% of neurons</td>
            </tr>
            <tr>
              <td>Second fully connected layer:</td>
              <td>850 nodes, ReLU</td>
            </tr>
            <tr>
              <td>Dropout layer:</td>
              <td>random exclusion of 50% of neurons</td>
            </tr>
            <tr>
              <td>Third fully connected layer:</td>
              <td>850 nodes, ReLU</td>
            </tr>
            <tr>
              <td>Dropout layer:</td>
              <td>random exclusion of 50% of neurons</td>
            </tr>
            <tr>
              <td>Fourth fully connected layer:</td>
              <td>750 nodes, ReLU</td>
            </tr>
            <tr>
              <td>Dropout layer:</td>
              <td>random exclusion of 50% of neurons</td>
            </tr>
            <tr>
              <td>Output layer:</td>
              <td>3 nodes for 3 classes, Softmax</td>
            </tr>
            <tr>
              <td>Optimization function:</td>
              <td>stochastic gradient descent (SGD)</td>
            </tr>
            <tr>
              <td>Learning rate:</td>
              <td>0.01</td>
            </tr>
            <tr>
              <td>Callback functions:</td>
              <td>EarlyStopping, ReduceLROnPlateau, ModelCheckpoint, TensorBoard</td>
            </tr>
          </tbody>
        </table>
      </table-wrap>
    </sec>
    <sec id="sec3">
      <title>3. Data Collection and Preprocessing</title>
      <p>To ensure the robustness and generalizability of the model, we used the FER2013 dataset, widely recognized in the facial recognition community. This dataset was selected for its wide diversity of expressions, angles, and lighting conditions.</p>
      <p>Unlike classic multi-dataset approaches, we chose to focus the model on three emotions: surprise (training data: 1171 images; test data: 831 images), neutrality (training data: 4965 images; test data: 1233 images) and joy (training data: 7215 images; test data: 1774 images), in order to simplify the classification task and improve real-time accuracy. Examples of images used for training are shown in <xref ref-type="fig" rid="fig2">Figure 2</xref>.</p>
      <fig id="fig2">
        <label>Figure 2</label>
        <graphic xlink:href="https://html.scirp.org/file/8104923-rId16.jpeg?20260423031309" />
      </fig>
      <p><bold>Figure 2</bold><bold>.</bold> Examples from the dataset.</p>
      <p>Preprocessing proceeds as follows:</p>
      <p><bold>1) Face detection and cropping</bold></p>
      <p>Facial registration involves locating the face in each image to eliminate background noise. Detection was performed using the OpenCV cascade classifier. Once detected, the face is cropped to reduce spatial complexity and facilitate CNN model training.</p>
      <p><bold>2) Grayscale Conversion</bold></p>
      <p>All images were resized to 48 × 48 pixels and then converted to grayscale (one channel) to reduce computational complexity and accelerate network training. This conversion simplifies data representation while preserving essential information about facial expressions.</p>
      <p><bold>3) Image Normalization</bold></p>
      <p>Pixel values were normalized to range from 0 to 1. Normalization improves learning convergence and stabilizes model training by harmonizing image intensity values.</p>
      <p><bold>4) Data Augmentation</bold></p>
      <p>To enhance the CNN’s generalization capabilities and address the limited number of examples per class, we applied data augmentation using the Keras ImageDataGenerator API. This technique generates new images from existing ones by applying random transformations, including: 1) rotations, 2) horizontal and vertical translations, 3) shearing, 4) random zooms, and 5) horizontal flips.</p>
      <p>This augmentation allows for the creation of a richer dataset, reducing overfitting and improving the model’s robustness to variations in pose or lighting conditions, which is crucial for real-time recognition (<xref ref-type="fig" rid="fig3">Figure 3</xref>).</p>
      <fig id="fig3">
        <label>Figure 3</label>
        <graphic xlink:href="https://html.scirp.org/file/8104923-rId17.jpeg?20260423031309" />
      </fig>
      <p><bold>Figure 3</bold><bold>.</bold> Data preprocessing.</p>
    </sec>
    <sec id="sec4">
      <title>4. Experimenting with Data Augmentation</title>
      <p>To improve the robustness and generalizability of the CNN model, we applied data augmentation techniques using the Keras ImageDataGenerator API. This function generates new images from the existing dataset by applying random transformations, such as: 1) Rotation around the image center (±15°); 2) Shearing to simulate pose variations; 3) Random zoom to represent faces that are closer or farther away; 4) Horizontal flipping; 5) Horizontal and vertical shifting.</p>
      <p>Before augmentation, the FER2013 dataset used for the three targeted emotions comprised 12,040 images, or approximately 4013 images per class (surprise, neutral, happy). Since CNNs are highly data-dependent models, the dataset was enriched using the aforementioned transformations. After augmentation, the dataset grew to 36,120 images, or approximately 12,040 images per class, significantly expanding the variety and coverage of use cases.</p>
      <p>This augmented data is particularly important for facial expression recognition because it:</p>
      <p>1) Allows the model to better learn the natural variations of faces (pose, lighting, orientation).</p>
      <p>2) Reduces the risk of overfitting on the original images.</p>
      <p>3) Improves real-time accuracy and stability, essential for interactive applications.</p>
      <p>The images were split 80% for training and 20% for validation. To test the model’s robustness under more challenging conditions, a second scenario was tested: 65% of the images for training and 35% for testing (<xref ref-type="fig" rid="fig4">Figure 4</xref>). This allows us to assess the model’s ability to generalize to a larger dataset not seen during training.</p>
      <p>This approach has made it possible to verify that the CNN model optimized for three emotions maintains stable performance, even when the proportion of test data increases, while maintaining the prediction speed necessary for real-time processing.</p>
      <fig id="fig4">
        <label>Figure 4</label>
        <graphic xlink:href="https://html.scirp.org/file/8104923-rId18.jpeg?20260423031310" />
      </fig>
      <p><bold>Figure 4</bold><bold>.</bold> Data augmentation.</p>
    </sec>
    <sec id="sec5">
      <title>5. System Implementation</title>
      <p>The system was implemented in Python, using the Spyder IDE. The main libraries used are TensorFlow, Keras, NumPy, OpenCV, PIL, and Matplotlib. TensorFlow handles the neural network execution and manages CPU/GPU-optimized computational operations. Keras provides built-in functions for creating CNN layers, activation functions, optimizers, and training management. OpenCV is used for image preprocessing, including face detection via the cascade classifier, cropping, grayscale conversion, and normalization. ImageDataGenerator (Keras) manages data augmentation to enrich the dataset and improve generalization. Matplotlib is used to visualize the results, including confusion matrices and performance curves.</p>
      <p>It is important to explicitly distinguish the preprocessing applied to the images from the FER2013 dataset from that used in the deployed web interface. In the case of FER2013, the images are already faces that have been detected, aligned, converted to grayscale, and resized to 48 × 48 pixels. Therefore, preprocessing during training is generally limited to normalizing the intensities (e.g., scaling pixels between 0 and 1), possibly standardization, and applying data augmentation techniques (rotations, translations, zooms, etc.). In contrast, in the web interface, the input images come from real streams (camera or uploaded images) and require a more complete pipeline including: 1) face detection via OpenCV (for example with a Haar Cascade type classifier), 2) extraction of the region of interest (ROI), 3) conversion to grayscale, 4) resizing to 48 × 48 pixels, 5) then normalization identical to that used in training.</p>
      <p>To make the system accessible to end users, a web-based graphical user interface (GUI) was developed using HTML, CSS, and JavaScript, and connected to the model via a Flask server (Python). The user can select a local image and then click “Predict” to display the detected class. When an image is provided: 1) It is resized to 48 × 48 pixels to match the CNN input. 2) The OpenCV cascade classifier detects the facial region. The face is cropped to isolate the region of interest and remove the background. 3) The image is converted to grayscale, as the model was trained on a single channel. 4) Normalization is applied to harmonize the pixel values between 0 and 1. 5) The preprocessed image is then sent to the custom CNN, which predicts the emotion class from among surprise, neutral, and happy. This implementation ensures real-time compatibility, stable prediction for novel images, and the ability to be integrated into interactive systems or web applications. <xref ref-type="fig" rid="fig5">Figure 5</xref> shows several screenshots of the interface, demonstrating the model’s ability to accurately detect and classify images from diverse sources.</p>
      <fig id="fig5">
        <label>Figure 5</label>
        <graphic xlink:href="https://html.scirp.org/file/8104923-rId19.jpeg?20260423031310" />
      </fig>
      <p><bold>Figure 5</bold><bold>.</bold> Real-time validation.</p>
    </sec>
    <sec id="sec6">
      <title>6. Results and Discussion</title>
      <p>The proposed model, a convolutional neural network (CNN) with data augmentation, was evaluated on the FER2013 dataset, limited to the three targeted emotions: surprise, neutral, and happy. Training was performed with a split ratio of 80% for training and 20% for validation.</p>
      <p>To ensure the reproducibility of the results, it is necessary to specify the main hyperparameters and training conditions. In particular, the model is trained with a batch size of 15351 images, a stochastic gradient descent (SGD) optimizer with a momentum parameter m (m = 0.9), and an initial learning rate <italic>η</italic> (<italic>η</italic> = 0.01), adjusted according to a learning-rate scheduling strategy (e.g., a reduction by a factor of <italic>γ</italic> (<italic>γ</italic> = 0.5) after p epochs without improvement in the validation loss). An early stopping mechanism is used with a patience of k (k = 10) epochs (<italic>i.e.</italic>, training is stopped if the validation metric does not improve for k consecutive epochs, according to a rule based on minimizing loss or maximizing accuracy). The experiments are initialized with a fixed random seed (s = 42) to ensure the reproducibility of the data partitions and weight initialization. Finally, the hardware used for training is specified (e.g., CPU or GPU, with the exact model), and the total training time or the number of epochs performed is indicated.</p>
      <p>After 50 epochs, the model achieved a validation accuracy of 89%, demonstrating the effectiveness of the CNN architecture combined with data augmentation. Accuracy remained high and nearly constant for all three classes, even when images exhibited slight geometric distortions (pose or facial tilt). The evolution of accuracy over epochs was tracked using TensorBoard (<xref ref-type="fig" rid="fig6">Figure 6</xref>), where the x-axis represents the number of epochs and the y-axis the recognition rate.</p>
      <p>The impact of data augmentation was clearly observed. The augmented model reached 89% accuracy in just 50 epochs. The applied transformations (rotation, translation, shear, zoom, and flip) introduced relevant variations into the dataset, improving the model’s ability to generalize to new images. Dropout (0.5) and early stopping mechanisms were used to limit overfitting and ensure stable convergence.</p>
      <p>To test the model’s robustness, a second scenario was evaluated with a 65/35 split ratio (65% training, 35% testing). In this case, the model achieved 87% accuracy, slightly lower than the 80/20 split but still very satisfactory considering the larger volume of test data. This demonstrates that the model maintains stable performance even when the number of unseen examples increases, which is crucial for real-time applications.</p>
      <fig id="fig6">
        <label>Figure 6</label>
        <graphic xlink:href="https://html.scirp.org/file/8104923-rId20.jpeg?20260423031310" />
      </fig>
      <p><bold>Figure 6</bold><bold>.</bold> Training and validation curve.</p>
      <p>Finally, the analysis of the results highlights that the model is able to classify the three emotions with a high and balanced recognition rate, process slightly distorted or rotated faces, and work effectively on novel images thanks to the combination of data augmentation, dropout and normalization.</p>
      <p>These results confirm that simplifying the problem (3 classes) and optimizing the CNN for real-time use makes it possible to achieve high accuracy, while maintaining low latency suitable for interactive or embedded systems (See <xref ref-type="fig" rid="fig7">Figure 7</xref> below).</p>
      <fig id="fig7">
        <label>Figure 7</label>
        <graphic xlink:href="https://html.scirp.org/file/8104923-rId21.jpeg?20260423031310" />
      </fig>
      <p><bold>Figure 7</bold><bold>.</bold> Confusion matrix.</p>
    </sec>
    <sec id="sec7">
      <title>7. Conclusion and Future Outlook</title>
      <p>In this research, we developed a real-time facial expression recognition system based on a data-augmented, optimized convolutional neural network (CNN). Focusing on three key emotions—surprise, neutral, and happy—the model achieves 89% validation accuracy on the FER2013 dataset. The results show that data augmentation, combined with regularization techniques such as dropout and early stopping, maintains a high and balanced recognition rate for each class. Furthermore, the model is capable of classifying images with slight geometric distortions, making it suitable for interactive, real-time applications.</p>
      <p>Experiments conducted with different data splitting ratios (80/20 and 65/35) demonstrate that the model maintains robust performance even as the test data volume increases, confirming the stability and generalizability of the proposed CNN. These results are consistent with recent observations in the facial expression recognition literature and underline the importance of data augmentation for limited datasets.</p>
      <p>Several research avenues can be explored for the future: 1) Extension to more emotion classes: While focusing on three emotions has improved accuracy and stability, including other emotions such as sadness or fear could enrich the system’s applicability, particularly in contexts of emotional monitoring or social interaction. 2) Optimization for real-time video streams: The current implementation has been validated on static images. Adapting it to video streams with continuous face detection and tracking would allow for the evaluation of performance over time sequences and the handling of emotional transitions. 3) Integration of advanced deep network techniques: The use of newer models, such as Residual CNNs (ResNet) or Transformers for vision, could improve the system’s ability to capture subtle facial features while maintaining real-time performance. 4) Deployment on embedded and mobile platforms: Simplifying the model and optimizing the real-time CNN pave the way for integration on embedded devices, such as social robots or mobile applications, where latency and power consumption are critical.</p>
      <p>In conclusion, this work demonstrates that combining a simple yet effective CNN, targeted data augmentation, and appropriate preprocessing enables the design of a robust and fast facial recognition system for interactive applications. This approach provides a solid foundation for future extensions to more comprehensive systems and dynamic environments.</p>
    </sec>
    <sec id="sec8">
      <title>Declaration</title>
      <p>The platform used to create or produce the portraits and images in this article is provided via the links: <ext-link ext-link-type="uri" xlink:href="https://mediacy.com/blog/ai-essentials-cnns-microscopy/">https://mediacy.com/blog/ai-essentials-cnns-microscopy/</ext-link>, <ext-link ext-link-type="uri" xlink:href="https://fr.freepik.com/photos/personne-triste">https://fr.freepik.com/photos/personne-triste</ext-link>, <ext-link ext-link-type="uri" xlink:href="https://www.google.com/search?sca_esv=2a704d97ad930897&amp;sxsrf=ANbL-n4Eq6k_Jp2mFwIoTm9EYUhYdWAoAg:1776756796080&amp;udm=2&amp;fbs=ADc_l-bpk8W4E-qsVlOvbGJcDwpnHC5OJXXTJvmMu2n9YYx-G8xzgQk24aW1N_FyIND5zVDd4bb14119C8nZHL5l4Fe3Q78DM888EmtVm1l7Ggrb1XBl29I-upxH2ZKiusq_Iw2q9oUHOoAZBYuy8EaAcNGNbMYqqXay6V_L7kQfc6l4SAS5l_Dqujgh0OOfmu5n67ZfjXhn9IJG0UpXFHUvFBgsdhB5UQ&amp;q=les+diff%C3%A9rents+%C3%A9motions+sur+les+visages&amp;sa=X&amp;ved=2ahUKEwi25Lrttv6TAxWyWUEAHXXaARoQtKgLegQIFhAB&amp;biw=1920&amp;bih=919&amp;dpr=1">https://www.google.com/search?sca_esv=2a704d97ad930897&amp;sxsrf=ANbL-n4Eq6k_Jp2mFwIoTm9EYUhYdWAoAg:1776756796080&amp;udm=2&amp;fbs=ADc_l-bpk8W4E-qsVlOvbGJcDwpnHC5OJXXTJvmMu2n9YYx-G8xzgQk24aW1N_FyIND5zVDd4bb14119C8nZHL5l4Fe3Q78DM888EmtVm1l7Ggrb1XBl29I-upxH2ZKiusq_Iw2q9oUHOoAZBYuy8EaAcNGNbMYqqXay6V_L7kQfc6l4SAS5l_Dqujgh0OOfmu5n67ZfjXhn9IJG0UpXFHUvFBgsdhB5UQ&amp;q=les+diff%C3%A9rents+%C3%A9motions+sur+les+visages&amp;sa=X&amp;ved=2ahUKEwi25Lrttv6TAxWyWUEAHXXaARoQtKgLegQIFhAB&amp;biw=1920&amp;bih=919&amp;dpr=1</ext-link>.</p>
    </sec>
  </body>
  <back>
    <ref-list>
      <title>References</title>
      <ref id="B1">
        <label>1.</label>
        <citation-alternatives>
          <mixed-citation publication-type="journal">Li, S. and Deng, W. (2022) Deep Facial Expression Recognition: A Survey. <italic>IEEE Transactions on Affective Computing</italic>, 13, 1195-1215. https://doi.org/10.1109/taffc.2020.2981446 <pub-id pub-id-type="doi">10.1109/taffc.2020.2981446</pub-id><ext-link ext-link-type="uri" xlink:href="https://doi.org/10.1109/taffc.2020.2981446">https://doi.org/10.1109/taffc.2020.2981446</ext-link></mixed-citation>
          <element-citation publication-type="journal">
            <person-group person-group-type="author">
              <string-name>Li, S.</string-name>
              <string-name>Deng, W.</string-name>
            </person-group>
            <year>2022</year>
            <article-title>Deep Facial Expression Recognition: A Survey</article-title>
            <source>IEEE Transactions on Affective Computing</source>
            <volume>13</volume>
            <pub-id pub-id-type="doi">10.1109/taffc.2020.2981446</pub-id>
          </element-citation>
        </citation-alternatives>
      </ref>
      <ref id="B2">
        <label>2.</label>
        <citation-alternatives>
          <mixed-citation publication-type="journal">Goodfellow, I.J., Erhan, D., Luc Carrier, P., Courville, A., Mirza, M., Hamner, B., <italic>et al</italic>. (2015) Challenges in Representation Learning: A Report on Three Machine Learning Contests. <italic>Neural Networks</italic>, 64, 59-63. https://doi.org/10.1016/j.neunet.2014.09.005 <pub-id pub-id-type="doi">10.1016/j.neunet.2014.09.005</pub-id><pub-id pub-id-type="pmid">25613956</pub-id><ext-link ext-link-type="uri" xlink:href="https://doi.org/10.1016/j.neunet.2014.09.005">https://doi.org/10.1016/j.neunet.2014.09.005</ext-link></mixed-citation>
          <element-citation publication-type="journal">
            <person-group person-group-type="author">
              <string-name>Goodfellow, I.J.</string-name>
              <string-name>Erhan, D.</string-name>
              <string-name>Carrier, P.</string-name>
              <string-name>Courville, A.</string-name>
              <string-name>Mirza, M.</string-name>
              <string-name>Hamner, B.</string-name>
            </person-group>
            <year>2015</year>
            <article-title>Challenges in Representation Learning: A Report on Three Machine Learning Contests</article-title>
            <source>Neural Networks</source>
            <volume>64</volume>
            <pub-id pub-id-type="doi">10.1016/j.neunet.2014.09.005</pub-id>
            <pub-id pub-id-type="pmid">25613956</pub-id>
          </element-citation>
        </citation-alternatives>
      </ref>
      <ref id="B3">
        <label>3.</label>
        <citation-alternatives>
          <mixed-citation publication-type="journal">Echoukairi, H., El Ghmary, M., Ziani, S. and Ouacha, A. (2023) Improved Methods for Automatic Facial Expression Recognition. <italic>International Journal of Interactive Mobile Technologies</italic> (<italic>iJIM</italic>), 17, 33-44. https://doi.org/10.3991/ijim.v17i06.37031 <pub-id pub-id-type="doi">10.3991/ijim.v17i06.37031</pub-id><ext-link ext-link-type="uri" xlink:href="https://doi.org/10.3991/ijim.v17i06.37031">https://doi.org/10.3991/ijim.v17i06.37031</ext-link></mixed-citation>
          <element-citation publication-type="journal">
            <person-group person-group-type="author">
              <string-name>Echoukairi, H.</string-name>
              <string-name>El Ghmary, M.</string-name>
              <string-name>Ziani, S.</string-name>
              <string-name>Ouacha, A.</string-name>
            </person-group>
            <year>2023</year>
            <article-title>Improved Methods for Automatic Facial Expression Recognition</article-title>
            <source>International Journal of Interactive Mobile Technologies (iJIM)</source>
            <volume>17</volume>
            <fpage>33</fpage>
            <lpage>44</lpage>
            <pub-id pub-id-type="doi">10.3991/ijim.v17i06.37031</pub-id>
          </element-citation>
        </citation-alternatives>
      </ref>
      <ref id="B4">
        <label>4.</label>
        <citation-alternatives>
          <mixed-citation publication-type="journal">Guo, A. (2025) Enhancing Facial Expression Recognition with Robust CNN Architectures and Adaptive Preprocessing Techniques. <italic>Applied and Computational Engineering</italic>, 100, 137-145. https://doi.org/10.54254/2755-2721/2025.20426 <pub-id pub-id-type="doi">10.54254/2755-2721/2025.20426</pub-id><ext-link ext-link-type="uri" xlink:href="https://doi.org/10.54254/2755-2721/2025.20426">https://doi.org/10.54254/2755-2721/2025.20426</ext-link></mixed-citation>
          <element-citation publication-type="journal">
            <person-group person-group-type="author">
              <string-name>Guo, A.</string-name>
            </person-group>
            <year>2025</year>
            <article-title>Enhancing Facial Expression Recognition with Robust CNN Architectures and Adaptive Preprocessing Techniques</article-title>
            <source>Applied and Computational Engineering</source>
            <volume>100</volume>
            <fpage>137</fpage>
            <lpage>145</lpage>
            <pub-id pub-id-type="doi">10.54254/2755-2721/2025.20426</pub-id>
          </element-citation>
        </citation-alternatives>
      </ref>
      <ref id="B5">
        <label>5.</label>
        <citation-alternatives>
          <mixed-citation publication-type="journal">Ajitha, V. (2024) CNN-Driven Enhancement in Facial Emotion Recognition Systems. <italic>International Journal of Intelligent Systems and Applications in Engineering</italic>, 12, 2343-2350. <ext-link ext-link-type="uri" xlink:href="https://ijisae.org/index.php/IJISAE/article/view/6620">https://ijisae.org/index.php/IJISAE/article/view/6620</ext-link></mixed-citation>
          <element-citation publication-type="journal">
            <person-group person-group-type="author">
              <string-name>Ajitha, V.</string-name>
            </person-group>
            <year>2024</year>
            <article-title>CNN-Driven Enhancement in Facial Emotion Recognition Systems</article-title>
            <source>International Journal of Intelligent Systems and Applications in Engineering</source>
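            <volume>12</volume>
            <fpage>2343</fpage>
            <lpage>2350</lpage>
            <ext-link ext-link-type="uri" xlink:href="https://ijisae.org/index.php/IJISAE/article/view/6620">https://ijisae.org/index.php/IJISAE/article/view/6620</ext-link>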
          </element-citation>
        </citation-alternatives>
      </ref>
      <ref id="B6">
        <label>6.</label>
        <citation-alternatives>
          <mixed-citation publication-type="journal">Liu, Y. (2023) The Study of Performance Related to Classical Convolutional Neural Networks in the Field of Facial Emotion Recognition. <italic>Applied and Computational Engineering</italic>, 8, 470-474. https://doi.org/10.54254/2755-2721/8/20230248 <pub-id pub-id-type="doi">10.54254/2755-2721/8/20230248</pub-id><ext-link ext-link-type="uri" xlink:href="https://doi.org/10.54254/2755-2721/8/20230248">https://doi.org/10.54254/2755-2721/8/20230248</ext-link></mixed-citation>
          <element-citation publication-type="journal">
            <person-group person-group-type="author">
              <string-name>Liu, Y.</string-name>
            </person-group>
            <year>2023</year>
            <article-title>The Study of Performance Related to Classical Convolutional Neural Networks in the Field of Facial Emotion Recognition</article-title>
            <source>Applied and Computational Engineering</source>
            <volume>8</volume>
            <fpage>470</fpage>
            <lpage>474</lpage>
            <pub-id pub-id-type="doi">10.54254/2755-2721/8/20230248</pub-id>
          </element-citation>
        </citation-alternatives>
      </ref>
      <ref id="B7">
        <label>7.</label>
        <citation-alternatives>
          <mixed-citation publication-type="journal">Xie, Y., Tian, W. and Yu, Z. (2023) Robust Facial Expression Recognition with Transformer Block Enhancement Module. <italic>Engineering Applications of Artificial Intelligence</italic>, 126, Article ID: 106795. https://doi.org/10.1016/j.engappai.2023.106795 <pub-id pub-id-type="doi">10.1016/j.engappai.2023.106795</pub-id><ext-link ext-link-type="uri" xlink:href="https://doi.org/10.1016/j.engappai.2023.106795">https://doi.org/10.1016/j.engappai.2023.106795</ext-link></mixed-citation>
          <element-citation publication-type="journal">
            <person-group person-group-type="author">
              <string-name>Xie, Y.</string-name>
              <string-name>Tian, W.</string-name>
              <string-name>Yu, Z.</string-name>
            </person-group>
            <year>2023</year>
            <article-title>Robust Facial Expression Recognition with Transformer Block Enhancement Module</article-title>
            <source>Engineering Applications of Artificial Intelligence</source>
            <volume>126</volume>
            <elocation-id>106795</elocation-id>
            <pub-id pub-id-type="doi">10.1016/j.engappai.2023.106795</pub-id>
          </element-citation>
        </citation-alternatives>
      </ref>
      <ref id="B8">
        <label>8.</label>
        <citation-alternatives>
          <mixed-citation publication-type="journal">Goodfellow, I., <italic>et al</italic>. (2013) Challenges in Representation Learning: Facial Expression Recognition Dataset (FER2013). <italic>Neural Networks</italic>, 27, 45-55.</mixed-citation>
          <element-citation publication-type="journal">
            <person-group person-group-type="author">
              <string-name>Goodfellow, I.</string-name>
              <etal/>
            </person-group>
            <year>2013</year>
            <article-title>Challenges in Representation Learning: Facial Expression Recognition Dataset (FER2013)</article-title>
            <source>Neural Networks</source>
            <volume>27</volume>
            <fpage>45</fpage>
            <lpage>55</lpage>
          </element-citation>
        </citation-alternatives>
      </ref>
    </ref-list>
  </back>
</article>