<?xml version="1.0" encoding="UTF-8"?>
<!DOCTYPE article PUBLIC "-//NLM//DTD JATS (Z39.96) Journal Publishing DTD v1.3 20210610//EN" "JATS-journalpublishing1-3.dtd">
<article article-type="research-article" dtd-version="1.3" xmlns:mml="http://www.w3.org/1998/Math/MathML" xmlns:xlink="http://www.w3.org/1999/xlink" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xml:lang="ru"><front><journal-meta><journal-id journal-id-type="publisher-id">izvestswsu</journal-id><journal-title-group><journal-title xml:lang="ru">Известия Юго-Западного государственного университета</journal-title><trans-title-group xml:lang="en"><trans-title>Proceedings of the Southwest State University</trans-title></trans-title-group></journal-title-group><issn pub-type="ppub">2223-1560</issn><issn pub-type="epub">2686-6757</issn><publisher><publisher-name>ЮЗГУ</publisher-name></publisher></journal-meta><article-meta><article-id pub-id-type="doi">10.21869/2223-1560-2021-25-1-82-109</article-id><article-id custom-type="elpub" pub-id-type="custom">izvestswsu-869</article-id><article-categories><subj-group subj-group-type="heading"><subject>Research Article</subject></subj-group><subj-group subj-group-type="section-heading" xml:lang="ru"><subject>Информатика, вычислительная техника и управление</subject></subj-group><subj-group subj-group-type="section-heading" xml:lang="en"><subject>Computer science, computer engineering and IT management</subject></subj-group></article-categories><title-group><article-title>Применение многозадачного глубокого обучения в задаче распознавания эмоций в речи</article-title><trans-title-group xml:lang="en"><trans-title>Applying Multitask Deep Learning to Emotion Recognition in Speech</trans-title></trans-title-group></title-group><contrib-group><contrib contrib-type="author" corresp="yes"><contrib-id contrib-id-type="orcid">https://orcid.org/0000-0002-3572-4493</contrib-id><name-alternatives><name name-style="eastern" xml:lang="ru"><surname>Рябинов</surname><given-names>А. В.</given-names></name><name name-style="western" xml:lang="en"><surname>Ryabinov</surname><given-names>A. 
V.</given-names></name></name-alternatives><bio xml:lang="ru"><p>Рябинов Артем Валерьевич, программист лаборатории автономных робототехнических систем, Санкт-Петербургский Федеральный исследовательский центр Российской академии наук (СПб ФИЦ РАН), Санкт-Петербургский институт информатики и автоматизации Российской академии наук</p><p>14-я линия В. О. 39, Санкт-Петербург 199178</p></bio><bio xml:lang="en"><p>Artem V. Ryabinov, Software Engineer of Laboratory of Autonomous Robotic Systems, St. Petersburg Federal Research Center of the Russian Academy of Sciences (SPC RAS), St. Petersburg Institute for Informatics and Automation of the Russian Academy of Sciences</p><p>39, 14th Line, St. Petersburg 199178</p></bio><email xlink:type="simple">iamryabinov@gmail.com</email><xref ref-type="aff" rid="aff-1"/></contrib><contrib contrib-type="author" corresp="yes"><contrib-id contrib-id-type="orcid">https://orcid.org/0000-0002-7032-0291</contrib-id><name-alternatives><name name-style="eastern" xml:lang="ru"><surname>Уздяев</surname><given-names>М. Ю.</given-names></name><name name-style="western" xml:lang="en"><surname>Uzdiaev</surname><given-names>M. Yu.</given-names></name></name-alternatives><bio xml:lang="ru"><p>Уздяев Михаил Юрьевич, младший научный сотрудник лаборатории технологий больших данных социокиберфизических систем, Санкт-Петербургский Федеральный исследовательский центр Российской академии наук (СПб ФИЦ РАН), Санкт-Петербургский институт информатики и автоматизации Российской академии наук</p><p>14-я линия В. О. 39, Санкт-Петербург 199178</p></bio><bio xml:lang="en"><p>Mikhail Yu. Uzdiaev, Junior Researcher of Laboratory of Big Data In Socio-Cyberphysical Systems, St. Petersburg Federal Research Center of the Russian Academy of Sciences (SPC RAS), St. Petersburg Institute for Informatics and Automation of the Russian Academy of Sciences</p><p>39, 14th Line, St. 
Petersburg 199178</p></bio><email xlink:type="simple">uzdyaev.m@iias.spb.su</email><xref ref-type="aff" rid="aff-1"/></contrib><contrib contrib-type="author" corresp="yes"><contrib-id contrib-id-type="orcid">https://orcid.org/0000-0001-5388-8152</contrib-id><name-alternatives><name name-style="eastern" xml:lang="ru"><surname>Ватаманюк</surname><given-names>И. В.</given-names></name><name name-style="western" xml:lang="en"><surname>Vatamaniuk</surname><given-names>I. V.</given-names></name></name-alternatives><bio xml:lang="ru"><p>Ватаманюк Ирина Валерьевна, младший научный сотрудник лаборатории автономных робототехнических систем, Санкт-Петербургский Федеральный исследовательский центр Российской академии наук (СПб ФИЦ РАН), Санкт-Петербургский институт информатики и автоматизации Российской академии наук</p><p>14-я линия В. О. 39, Санкт-Петербург 199178</p></bio><bio xml:lang="en"><p>Irina V. Vatamaniuk, Junior Researcher of Laboratory of Autonomous Robotic Systems, St. Petersburg Federal Research Center of the Russian Academy of Sciences (SPC RAS), St. Petersburg Institute for Informatics and Automation of the Russian Academy of Sciences</p><p>39, 14th Line, St. Petersburg 199178</p></bio><email xlink:type="simple">vatamaniuk.i.v@gmail.com</email><xref ref-type="aff" rid="aff-1"/></contrib></contrib-group><aff-alternatives id="aff-1"><aff xml:lang="ru"><institution>Санкт-Петербургский Федеральный исследовательский центр Российской академии наук; Санкт-Петербургский институт информатики и автоматизации Российской академии наук</institution></aff><aff xml:lang="en"><institution>St. Petersburg Federal Research Center of the Russian Academy of Sciences (SPC RAS); St. 
Petersburg Institute for Informatics and Automation of the Russian Academy of Sciences</institution></aff></aff-alternatives><pub-date pub-type="collection"><year>2021</year></pub-date><pub-date pub-type="epub"><day>30</day><month>05</month><year>2021</year></pub-date><volume>25</volume><issue>1</issue><fpage>82</fpage><lpage>109</lpage><permissions><copyright-statement>Copyright © Рябинов А.В., Уздяев М.Ю., Ватаманюк И.В., 2021</copyright-statement><copyright-year>2021</copyright-year><copyright-holder xml:lang="ru">Рябинов А.В., Уздяев М.Ю., Ватаманюк И.В.</copyright-holder><copyright-holder xml:lang="en">Ryabinov A.V., Uzdiaev M.Y., Vatamaniuk I.V.</copyright-holder><license xml:lang="ru" license-type="creative-commons-attribution" xlink:href="https://creativecommons.org/licenses/by/4.0/" xlink:type="simple"><license-p>Данная работа распространяется под лицензией Creative Commons Attribution 4.0.</license-p></license><license xml:lang="en" license-type="creative-commons-attribution" xlink:href="https://creativecommons.org/licenses/by/4.0/" xlink:type="simple"><license-p>This work is licensed under a Creative Commons Attribution 4.0 License.</license-p></license></permissions><self-uri xlink:href="https://izvestswsu.elpub.ru/jour/article/view/869">https://izvestswsu.elpub.ru/jour/article/view/869</self-uri><abstract><sec><title>Цель исследования</title><p>Цель исследования. Эмоции играют одну из ключевых ролей в регуляции поведения человека. Решение задачи автоматического распознавания эмоций позволяет повысить эффективность функционирования целого ряда цифровых систем: систем обеспечения безопасности, человеко-машинных интерфейсов, систем электронной коммерции и т.д. При этом отмечается низкая эффективность современных подходов распознавания эмоций в речи. Данная работа посвящена исследованию автоматического распознавания эмоций в речи с помощью методов машинного обучения.</p></sec><sec><title>Методы</title><p>Методы. 
В статье описан и протестирован подход к автоматическому распознаванию эмоций в речи на основе многозадачного обучения глубоких сверточных нейронных сетей архитектур AlexNet и VGG с применением автоматического подбора коэффициентов весов каждой задачи при вычислении итогового значения потери в процессе обучения. Все модели были обучены на выборке набора данных IEMOCAP с четырьмя эмоциональными категориями «гнев», «счастье», «нейтральная эмоция», «грусть». В качестве входных данных используются обработанные специализированным алгоритмом лог-мел спектрограммы высказываний.</p></sec><sec><title>Результаты</title><p>Результаты. Рассмотренные модели были протестированы на основе численных метрик: доля верно распознанных экземпляров, точность, полнота, f-мера. По всем вышеперечисленным метрикам получено улучшение качества распознавания эмоций предлагаемой моделью по сравнению с двумя базовыми однозадачными моделями, а также с известными решениями. Это достигается благодаря применению автоматического взвешивания значений функций потерь от отдельных задач при формировании итогового значения ошибки в процессе обучения.</p></sec><sec><title>Заключение</title><p>Заключение. Полученное улучшение качества распознавания эмоций по сравнению с известными решениями подтверждает целесообразность применения концепции многозадачного обучения для увеличения точности моделей распознавания эмоций. Разработанный подход позволяет достичь равномерного и одновременного снижения ошибок отдельных задач и используется в области распознавания эмоций в речи впервые.</p></sec></abstract><trans-abstract xml:lang="en"><sec><title>Purpose of research</title><p>Purpose of research. Emotions play one of the key roles in the regulation of human behaviour. Solving the problem of automatic recognition of emotions makes it possible to increase the effectiveness of operation of a whole range of digital systems such as security systems, human-machine interfaces, e-commerce systems, etc. 
At the same time, the low efficiency of modern approaches to recognizing emotions in speech can be noted. This work studies automatic recognition of emotions in speech applying machine learning methods.</p></sec><sec><title>Methods</title><p>Methods. The article describes and tests an approach to automatic recognition of emotions in speech based on multitask learning of deep convolution neural networks of AlexNet and VGG architectures using automatic selection of the weight coefficients for each task when calculating the final loss value during learning. All the models were trained on a sample of the IEMOCAP dataset with four emotional categories of ‘anger’, ‘happiness’, ‘neutral emotion’, ‘sadness’. The log-mel spectrograms of statements processed by a specialized algorithm are used as input data.</p></sec><sec><title>Results</title><p>Results. The considered models were tested on the basis of numerical metrics: the share of correctly recognized instances (accuracy), precision, recall, f-measure. For all of the above metrics, an improvement in the quality of emotion recognition by the proposed model was obtained in comparison with the two basic single-task models as well as with known solutions. This result is achieved through the use of automatic weighting of the values of the loss functions from individual tasks when forming the final value of the error in the learning process.</p></sec><sec><title>Conclusion</title><p>Conclusion. The resulting improvement in the quality of emotion recognition in comparison with the known solutions confirms the feasibility of applying multitask learning to increase the accuracy of emotion recognition models. 
The developed approach makes it possible to achieve a uniform and simultaneous reduction of errors of individual tasks, and is used in the field of emotions recognition in speech for the first time.</p></sec></trans-abstract><kwd-group xml:lang="ru"><kwd>многозадачное обучение</kwd><kwd>сверточные нейронные сети</kwd><kwd>речевые технологии</kwd><kwd>автоматическое распознавание эмоций</kwd><kwd>анализ аудиосигналов речи</kwd></kwd-group><kwd-group xml:lang="en"><kwd>multitask learning</kwd><kwd>convolution neural networks</kwd><kwd>speech technologies</kwd><kwd>automatic emotion recognition</kwd><kwd>analysis of audio signals of speech</kwd></kwd-group><funding-group><funding-statement xml:lang="ru">Работа выполнена при поддержке РФФИ (18-29-22061_мк).</funding-statement><funding-statement xml:lang="en">The work was supported by the Russian Foundation for Basic Research (18-29-22061_mk).</funding-statement></funding-group></article-meta></front><back><ref-list><title>References</title><ref id="cit1"><label>1</label><citation-alternatives><mixed-citation xml:lang="ru">Tokuno S., Tsumatori, G., Shono S., Takei E., Yamamoto T., Suzuki G., Mituyoshi S., Shimura M. Usage of emotion recognition in military health care // Defense Science Research Conference and Expo (DSR). IEEE, 2011, P. 1-5. https://doi.org/10.1109/DSR.2011.6026823</mixed-citation><mixed-citation xml:lang="en">Tokuno S., Tsumatori, G., Shono S., Takei E., Yamamoto T., Suzuki G., Mituyoshi S., Shimura M. Usage of emotion recognition in military health care. Defense Science Research Conference and Expo (DSR). IEEE, 2011:1-5. https://doi.org/10.1109/DSR.2011.6026823</mixed-citation></citation-alternatives></ref><ref id="cit2"><label>2</label><citation-alternatives><mixed-citation xml:lang="ru">Saste S.T., Jagdale S.M. Emotion recognition from speech using MFCC and DWT for security system // 2017 international conference of electronics, communication and aerospace technology (ICECA). IEEE, 2017. 1. P. 
701-704. https://doi.org/10.1109/ICECA.2017.8203631</mixed-citation><mixed-citation xml:lang="en">Saste S.T., Jagdale S.M. Emotion recognition from speech using MFCC and DWT for security system. 2017 international conference of electronics, communication and aerospace technology (ICECA). IEEE, 2017; 1:701-704. https://doi.org/10.1109/ICECA.2017.8203631</mixed-citation></citation-alternatives></ref><ref id="cit3"><label>3</label><citation-alternatives><mixed-citation xml:lang="ru">Rázuri J.G., Sundgren D., Rahmani R., Moran A., Bonet I., Larsson A. Speech emotion recognition in emotional feedback for human-robot interaction // International Journal of Advanced Research in Artificial Intelligence (IJARAI). 2015. No. 4(2). P. 20-27. https://doi.org/10.14569/IJARAI.2015.040204</mixed-citation><mixed-citation xml:lang="en">Rázuri J.G., Sundgren D., Rahmani R., Moran A., Bonet I., Larsson A. Speech emotion recognition in emotional feedback for human-robot interaction. International Journal of Advanced Research in Artificial Intelligence (IJARAI), 2015, 4(2), pp. 20-27. https://doi.org/10.14569/IJARAI.2015.040204</mixed-citation></citation-alternatives></ref><ref id="cit4"><label>4</label><citation-alternatives><mixed-citation xml:lang="ru">Bojanić M., Delić V., Karpov A. Call redistribution for a call center based on speech emotion recognition // Applied Sciences. 2020. 10(13). P. 4653. https://doi.org/10.3390/app10134653</mixed-citation><mixed-citation xml:lang="en">Bojanić M., Delić V., Karpov A. Call redistribution for a call center based on speech emotion recognition. Applied Sciences, 2020, no. 10(13), p. 4653. https://doi.org/10.3390/app10134653</mixed-citation></citation-alternatives></ref><ref id="cit5"><label>5</label><citation-alternatives><mixed-citation xml:lang="ru">Björn W., Schuller L. Speech emotion recognition: two decades in a nutshell, benchmarks, and ongoing trends // Communications of the ACM. 2018. 61(5). P. 90-99. 
https://doi.org/10.1145/3129340</mixed-citation><mixed-citation xml:lang="en">Björn W., Schuller L. Speech emotion recognition: two decades in a nutshell, benchmarks, and ongoing trends. Communications of the ACM, 2018, no. 61(5), pp. 90-99. https://doi.org/10.1145/3129340</mixed-citation></citation-alternatives></ref><ref id="cit6"><label>6</label><citation-alternatives><mixed-citation xml:lang="ru">Вилюнас В.К. Эмоции // Большой психологический словарь/под общ. ред. Б.Г. Мещерякова, В.П. Зинченко. URL: https://psychological.slovaronline.com/2078-EMOTSII</mixed-citation><mixed-citation xml:lang="en">Vilyunas V.K. [Emotions]. Bol'shoj psihologicheskij slovar' [Big psychological dictionary] /pod obshch. red. B.G. Meshcheryakova, V.P. Zinchenko (In Russ.). Available at: https://psychological.slovaronline.com/2078-EMOTSII</mixed-citation></citation-alternatives></ref><ref id="cit7"><label>7</label><citation-alternatives><mixed-citation xml:lang="ru">Ильин Е.П. Эмоции и чувства. СПб.: Издательский дом "Питер", 2011.</mixed-citation><mixed-citation xml:lang="en">Il'in E.P., Emocii i chuvstva [Emotions and feelings]. Saint-Petersburg, Piter Publ., 2011 (In Russ.)</mixed-citation></citation-alternatives></ref><ref id="cit8"><label>8</label><citation-alternatives><mixed-citation xml:lang="ru">Sailunaz K., Dhaliwal M., Rokne J., Alhajj R. Emotion detection from text and speech: a survey // Social Network Analysis and Mining. 2018. 8(1). P. 28. https://doi.org/10.1007/s13278-018-0505-2</mixed-citation><mixed-citation xml:lang="en">Sailunaz K., Dhaliwal M., Rokne J., Alhajj R. Emotion detection from text and speech: a survey. Social Network Analysis and Mining, 2018, no. 8(1), p. 28. https://doi.org/10.1007/s13278-018-0505-2</mixed-citation></citation-alternatives></ref><ref id="cit9"><label>9</label><citation-alternatives><mixed-citation xml:lang="ru">Ekman P. Facial expression and emotion // American psychologist. 1993. 48 (4). P. 384. 
https://doi.org/10.1037/0003-066X.48.4.384</mixed-citation><mixed-citation xml:lang="en">Ekman P. Facial expression and emotion. American psychologist, 1993. 48(4), p. 384. https://doi.org/10.1037/0003-066X.48.4.384</mixed-citation></citation-alternatives></ref><ref id="cit10"><label>10</label><citation-alternatives><mixed-citation xml:lang="ru">Russell J.A. Affective space is bipolar // Journal of personality and social psychology. 1979. 37 (3). P. 345. https://doi.org/10.1037/0022-3514.37.3.345</mixed-citation><mixed-citation xml:lang="en">Russell J.A. Affective space is bipolar. Journal of personality and social psychology, 1979, no. 37 (3), p. 345. https://doi.org/10.1037/0022-3514.37.3.345</mixed-citation></citation-alternatives></ref><ref id="cit11"><label>11</label><citation-alternatives><mixed-citation xml:lang="ru">Russell J.A. Culture and the categorization of emotions // Psychological bulletin. 1991. 110 (3). P. 426. https://doi.org/10.1037/0033-2909.110.3.426</mixed-citation><mixed-citation xml:lang="en">Russell J.A. Culture and the categorization of emotions. Psychological bulletin, 1991, no. 110 (3), p. 426. https://doi.org/10.1037/0033-2909.110.3.426</mixed-citation></citation-alternatives></ref><ref id="cit12"><label>12</label><citation-alternatives><mixed-citation xml:lang="ru">Adieu features? End-to-end speech emotion recognition using a deep convolutional recurrent network / G. Trigeorgis, F. Ringeval, R. Brueckner, E. Marchi, M.A. Nicolaou, B. Schuller, S. Zafeiriou // 2016 IEEE international conference on acoustics, speech and signal processing (ICASSP). IEEE, 2016. P. 5200-5204. https://doi.org/10.1109/ICASSP.2016.7472669</mixed-citation><mixed-citation xml:lang="en">Trigeorgis G., Ringeval F., Brueckner R., Marchi E., Nicolaou M.A., Schuller B., Zafeiriou S. Adieu features? End-to-end speech emotion recognition using a deep convolutional recurrent network. 2016 IEEE international conference on acoustics, speech and signal processing (ICASSP). 
IEEE, 2016:5200-5204. https://doi.org/10.1109/ICASSP.2016.7472669</mixed-citation></citation-alternatives></ref><ref id="cit13"><label>13</label><citation-alternatives><mixed-citation xml:lang="ru">Continuous Speech Emotion Recognition with Convolutional Neural Networks / N. Vryzas, L. Vrysis, M. Matsiola, R. Kotsakis, C. Dimoulas, G. Kalliris // Journal of the Audio Engineering Society. 2020. 68 (1/2). P. 14-24. https://doi.org/10.17743/jaes.2019.0043</mixed-citation><mixed-citation xml:lang="en">Vryzas N., Vrysis L., Matsiola M., Kotsakis R., Dimoulas C., Kalliris G. Continuous Speech Emotion Recognition with Convolutional Neural Networks. Journal of the Audio Engineering Society, 2020, no. 68(1/2), pp. 14-24. https://doi.org/10.17743/jaes.2019.0043</mixed-citation></citation-alternatives></ref><ref id="cit14"><label>14</label><citation-alternatives><mixed-citation xml:lang="ru">3-D convolutional recurrent neural networks with attention model for speech emotion recognition / M. Chen, X. He, J. Yang, H. Zhang // IEEE Signal Processing Letters. 2018. 25(10). P. 1440-1444. https://doi.org/10.1109/LSP.2018.2860246</mixed-citation><mixed-citation xml:lang="en">Chen M., He X., Yang J., Zhang H. 3-D convolutional recurrent neural networks with attention model for speech emotion recognition. IEEE Signal Processing Letters, 2018, no. 25(10), pp. 1440-1444. https://doi.org/10.1109/LSP.2018.2860246</mixed-citation></citation-alternatives></ref><ref id="cit15"><label>15</label><citation-alternatives><mixed-citation xml:lang="ru">Satt A., Rozenberg S., Hoory R. Efficient Emotion Recognition from Speech Using Deep Learning on Spectrograms // Interspeech. 2017. P. 1089-1093. https://doi.org/10.21437/Interspeech.2017-200</mixed-citation><mixed-citation xml:lang="en">Satt A., Rozenberg S., Hoory R. Efficient Emotion Recognition from Speech Using Deep Learning on Spectrograms. Interspeech, 2017, pp. 1089-1093. 
https://doi.org/10.21437/Interspeech.2017-200</mixed-citation></citation-alternatives></ref><ref id="cit16"><label>16</label><citation-alternatives><mixed-citation xml:lang="ru">Zhang Z., Wu B., Schuller B. Attention-augmented end-to-end multi-task learning for emotion prediction from speech // ICASSP 2019-2019 IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP). IEEE, 2019. P. 6705-6709. https://doi.org/10.1109/ICASSP.2019.8682896</mixed-citation><mixed-citation xml:lang="en">Zhang Z., Wu B., Schuller B. Attention-augmented end-to-end multi-task learning for emotion prediction from speech. ICASSP 2019-2019 IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP). IEEE, 2019, pp. 6705-6709. https://doi.org/10.1109/ICASSP.2019.8682896</mixed-citation></citation-alternatives></ref><ref id="cit17"><label>17</label><citation-alternatives><mixed-citation xml:lang="ru">Affective video content analysis: A multidisciplinary insight / Y. Baveye, C. Chamaret, E. Dellandréa, L. Chen // IEEE Transactions on Affective Computing. 2017. 9(4). P. 396-409. https://doi.org/10.1109/TAFFC.2020.2983669</mixed-citation><mixed-citation xml:lang="en">Baveye Y., Chamaret C., Dellandréa E., Chen L. Affective video content analysis: A multidisciplinary insight. IEEE Transactions on Affective Computing, 2017, no. 9(4), pp. 396-409. https://doi.org/10.1109/TAFFC.2020.2983669</mixed-citation></citation-alternatives></ref><ref id="cit18"><label>18</label><citation-alternatives><mixed-citation xml:lang="ru">Caruana R. Multitask learning // Machine learning. 1997. 28(1). P. 41-75. https://doi.org/10.1023/A:1007379606734</mixed-citation><mixed-citation xml:lang="en">Caruana R. Multitask learning. Machine learning, 1997, no. 28(1), pp. 41-75. 
https://doi.org/10.1023/A:1007379606734</mixed-citation></citation-alternatives></ref><ref id="cit19"><label>19</label><citation-alternatives><mixed-citation xml:lang="ru">IEMOCAP: Interactive emotional dyadic motion capture database / C. Busso, M. Bulut, C.C. Lee, A. Kazemzadeh, E. Mower, S. Kim, J. Chang, S. Lee, S.S. Narayanan // Language resources and evaluation. 2008. 42(4). P. 335. https://doi.org/10.1007/s10579-008-9076-6</mixed-citation><mixed-citation xml:lang="en">Busso C., Bulut M., Lee C.C., Kazemzadeh A., Mower E., Kim S., Chang J., Lee S., Narayanan S.S. IEMOCAP: Interactive emotional dyadic motion capture database. Language resources and evaluation, 2008, no. 42(4), 335 p. https://doi.org/10.1007/s10579-008-9076-6</mixed-citation></citation-alternatives></ref><ref id="cit20"><label>20</label><citation-alternatives><mixed-citation xml:lang="ru">The Geneva minimalistic acoustic parameter set (GeMAPS) for voice research and affective computing / F. Eyben, K.R. Scherer, B.W. Schuller, J. Sundberg, E. André, C. Busso, L. Devillers, J. Epps, P. Laukka, S. Narayanan, K. Truong // IEEE transactions on affective computing. 2015. 7(2). P. 190-202. https://doi.org/10.1109/TAFFC.2015.2457417</mixed-citation><mixed-citation xml:lang="en">Eyben F., Scherer K.R., Schuller B.W., Sundberg J., André E., Busso C., Devillers L., Epps J., Laukka P., Narayanan S., Truong K. The Geneva minimalistic acoustic parameter set (GeMAPS) for voice research and affective computing. IEEE transactions on affective computing, 2015, no. 7(2), pp. 190-202. https://doi.org/10.1109/TAFFC.2015.2457417</mixed-citation></citation-alternatives></ref><ref id="cit21"><label>21</label><citation-alternatives><mixed-citation xml:lang="ru">The INTERSPEECH 2013 computational paralinguistics challenge: Social signals, conflict, emotion, autism / B. Schuller, S. Steidl, A. Batliner, A. Vinciarelli, K. Scherer, F. Ringeval, M. Chetouani, F. Weninger, F. Eyben, E. Marchi, M. Mortillaro, H. Salamin, A. 
Polychroniou, F. Valente, S. Kim // Proceedings INTERSPEECH 2013, 14th Annual Conference of the International Speech Communication Association, Lyon, France. 2013. URL: https://mediatum.ub.tum.de/doc/1189705/file.pdf</mixed-citation><mixed-citation xml:lang="en">Schuller B., Steidl S., Batliner A., Vinciarelli A., Scherer K., Ringeval F., Chetouani M., Weninger F., Eyben F., Marchi E., Mortillaro M., Salamin H., Polychroniou A., Valente F., Kim S. The INTERSPEECH 2013 computational paralinguistics challenge: Social signals, conflict, emotion, autism. Proceedings INTERSPEECH 2013, 14th Annual Conference of the International Speech Communication Association, Lyon, France, 2013. Available at: https://mediatum.ub.tum.de/doc/1189705/file.pdf</mixed-citation></citation-alternatives></ref><ref id="cit22"><label>22</label><citation-alternatives><mixed-citation xml:lang="ru">Akçay M.B., Oğuz K. Speech emotion recognition: Emotional models, databases, features, preprocessing methods, supporting modalities, and classifiers // Speech Communication. 2020. 116. P. 56-76. https://doi.org/10.1016/j.specom.2019.12.001</mixed-citation><mixed-citation xml:lang="en">Akçay M.B., Oğuz K. Speech emotion recognition: Emotional models, databases, features, preprocessing methods, supporting modalities, and classifiers. Speech Communication. 2020, no. 116, pp. 56-76. Available at: https://doi.org/10.1016/j.specom.2019.12.001</mixed-citation></citation-alternatives></ref><ref id="cit23"><label>23</label><citation-alternatives><mixed-citation xml:lang="ru">The relevance of feature type for the automatic classification of emotional user states: low level descriptors and functionals / B. Schuller, A. Batliner, D. Seppi, S. Steidl, T. Vogt, J. Wagner, L. Devillers, L. Vidrascu, N. Amir, L. Kessous, V. Aharonson // Eighth Annual Conference of the International Speech Communication Association. 2007. P. 2253-2256. 
URL: https://www.isca-speech.org/archive/interspeech_2007/i07_2253.html</mixed-citation><mixed-citation xml:lang="en">Schuller B., Batliner A., Seppi D., Steidl S., Vogt T., Wagner J., Devillers L., Vidrascu L., Amir N., Kessous L., Aharonson V. The relevance of feature type for the automatic classification of emotional user states: low level descriptors and functionals. Eighth Annual Conference of the International Speech Communication Association, 2007, pp. 2253-2256. Available at: https://www.isca-speech.org/archive/interspeech_2007/i07_2253.html</mixed-citation></citation-alternatives></ref><ref id="cit24"><label>24</label><citation-alternatives><mixed-citation xml:lang="ru">Introducing the RECOLA multimodal corpus of remote collaborative and affective interactions / F. Ringeval, A. Sonderegger, J. Sauer, D. Lalanne // 2013 10th IEEE international conference and workshops on automatic face and gesture recognition (FG). IEEE, 2013. P. 1-8. https://doi.org/10.1109/FG.2013.6553805</mixed-citation><mixed-citation xml:lang="en">Ringeval F., Sonderegger A., Sauer J., Lalanne D. Introducing the RECOLA multimodal corpus of remote collaborative and affective interactions. 2013 10th IEEE international conference and workshops on automatic face and gesture recognition (FG). IEEE, 2013, pp. 1-8. https://doi.org/10.1109/FG.2013.6553805</mixed-citation></citation-alternatives></ref><ref id="cit25"><label>25</label><citation-alternatives><mixed-citation xml:lang="ru">Sound classification using convolutional neural network and tensor deep stacking network / A. Khamparia, D. Gupta, N.G. Nguyen, A. Khanna, B. Pandey, P. Tiwari // IEEE Access. 2019. 7. P. 7717-7727. https://doi.org/10.1109/ACCESS.2018.2888882</mixed-citation><mixed-citation xml:lang="en">Khamparia A., Gupta D., Nguyen N.G., Khanna A., Pandey B., Tiwari P. Sound classification using convolutional neural network and tensor deep stacking network. IEEE Access, 2019; 7:7717-7727. 
https://doi.org/10.1109/ACCESS.2018.2888882</mixed-citation></citation-alternatives></ref><ref id="cit26"><label>26</label><citation-alternatives><mixed-citation xml:lang="ru">Speaker-independent Japanese isolated speech word recognition using TDRC features / N.S.S. Srinivas, N. Sugan, L.S. Kumar, M.K. Nath, A. Kanhe // 2018 International CET Conference on Control, Communication, and Computing (IC4). IEEE, 2018. P. 278-283. https://doi.org/10.1109/CETIC4.2018.8530947</mixed-citation><mixed-citation xml:lang="en">Srinivas N.S.S., Sugan N., Kumar L.S., Nath M.K., Kanhe A. Speaker-independent Japanese isolated speech word recognition using TDRC features. 2018 International CET Conference on Control, Communication, and Computing (IC4). IEEE, 2018, pp. 278-283. https://doi.org/10.1109/CETIC4.2018.8530947</mixed-citation></citation-alternatives></ref><ref id="cit27"><label>27</label><citation-alternatives><mixed-citation xml:lang="ru">Speaker identification using FrFT-based spectrogram and RBF neural network / P. Li, Y. Li, D. Luo, H. Luo // 2015 34th Chinese Control Conference (CCC). IEEE, 2015. P. 3674-3679. https://doi.org/10.1109/ChiCC.2015.7260207</mixed-citation><mixed-citation xml:lang="en">Li P., Li Y., Luo D., Luo H. Speaker identification using FrFT-based spectrogram and RBF neural network. 2015 34th Chinese Control Conference (CCC). IEEE, 2015, pp. 3674-3679. https://doi.org/10.1109/ChiCC.2015.7260207</mixed-citation></citation-alternatives></ref><ref id="cit28"><label>28</label><citation-alternatives><mixed-citation xml:lang="ru">Speech emotion recognition for performance interaction / N. Vryzas, R. Kotsakis, A. Liatsou, C.A. Dimoulas, G. Kalliris // Journal of the Audio Engineering Society. 2018. 66(6). P. 457-467. https://doi.org/10.17743/jaes.2018.0036</mixed-citation><mixed-citation xml:lang="en">Vryzas N., Kotsakis R., Liatsou A., Dimoulas C.A., Kalliris G. Speech emotion recognition for performance interaction. 
Journal of the Audio Engineering Society, 2018, 66(6), pp. 457-467. https://doi.org/10.17743/jaes.2018.0036</mixed-citation></citation-alternatives></ref><ref id="cit29"><label>29</label><citation-alternatives><mixed-citation xml:lang="ru">Attention-based models for speech recognition / J.K. Chorowski, D. Bahdanau, D. Serdyuk, K. Cho, Y. Bengio // Advances in neural information processing systems. 2015. 28. P. 577-585. URL: https://papers.nips.cc/paper/2015/hash/1068c6e4c8051cfd4e9ea8072e3189e2-Abstract.html</mixed-citation><mixed-citation xml:lang="en">Chorowski J.K., Bahdanau D., Serdyuk D., Cho K., Bengio Y. Attention-based models for speech recognition. Advances in neural information processing systems, 2015, 28, pp. 577-585. Available at: https://papers.nips.cc/paper/2015/hash/1068c6e4c8051cfd4e9ea8072e3189e2-Abstract.html</mixed-citation></citation-alternatives></ref><ref id="cit30"><label>30</label><citation-alternatives><mixed-citation xml:lang="ru">A database of German emotional speech / F. Burkhardt, A. Paeschke, M. Rolfes, W.F. Sendlmeier, B. Weiss // Ninth European Conference on Speech Communication and Technology. 2005. URL: https://www.isca-speech.org/archive/archive_papers/interspeech_2005/i05_1517.pdf</mixed-citation><mixed-citation xml:lang="en">Burkhardt F., Paeschke A., Rolfes M., Sendlmeier W.F., Weiss B. A database of German emotional speech. Ninth European Conference on Speech Communication and Technology, 2005. Available at: https://www.isca-speech.org/archive/archive_papers/interspeech_2005/i05_1517.pdf</mixed-citation></citation-alternatives></ref><ref id="cit31"><label>31</label><citation-alternatives><mixed-citation xml:lang="ru">Dropout: a simple way to prevent neural networks from overfitting / N. Srivastava, G. Hinton, A. Krizhevsky, I. Sutskever, R. Salakhutdinov // The journal of machine learning research. 2014. 15(1). P. 1929-1958. 
https://dl.acm.org/doi/abs/10.5555/2627435.2670313</mixed-citation><mixed-citation xml:lang="en">Srivastava N., Hinton G., Krizhevsky A., Sutskever I., Salakhutdinov R. Dropout: a simple way to prevent neural networks from overfitting. The journal of machine learning research. 2014, no. 15(1), pp. 1929-1958. Available at: https://dl.acm.org/doi/abs/10.5555/2627435.2670313</mixed-citation></citation-alternatives></ref><ref id="cit32"><label>32</label><citation-alternatives><mixed-citation xml:lang="ru">Bilen H., Vedaldi A. Universal representations: The missing link between faces, text, planktons, and cat breeds // arXiv preprint arXiv:1701.07275. 2017.</mixed-citation><mixed-citation xml:lang="en">Bilen H., Vedaldi A. Universal representations: The missing link between faces, text, planktons, and cat breeds. arXiv preprint arXiv:1701.07275. 2017.</mixed-citation></citation-alternatives></ref><ref id="cit33"><label>33</label><citation-alternatives><mixed-citation xml:lang="ru">Das A., Hasegawa-Johnson M., Veselý K. Deep Auto-Encoder Based Multi-Task Learning Using Probabilistic Transcriptions // INTERSPEECH. 2017. P. 2073-2077. https://doi.org/10.21437/Interspeech.2017-582</mixed-citation><mixed-citation xml:lang="en">Das A., Hasegawa-Johnson M., Veselý K. Deep Auto-Encoder Based Multi-Task Learning Using Probabilistic Transcriptions. INTERSPEECH, 2017, pp. 2073-2077. https://doi.org/10.21437/Interspeech.2017-582</mixed-citation></citation-alternatives></ref><ref id="cit34"><label>34</label><citation-alternatives><mixed-citation xml:lang="ru">Sanh V., Wolf T., Ruder S. A hierarchical multi-task approach for learning embeddings from semantic tasks // Proceedings of the AAAI Conference on Artificial Intelligence. 2019. 33. P. 6949-6956. https://doi.org/10.1609/aaai.v33i01.33016949</mixed-citation><mixed-citation xml:lang="en">Sanh V., Wolf T., Ruder S. A hierarchical multi-task approach for learning embeddings from semantic tasks. 
Proceedings of the AAAI Conference on Artificial Intelligence, 2019, no. 33. pp. 6949-6956. https://doi.org/10.1609/aaai.v33i01.33016949</mixed-citation></citation-alternatives></ref><ref id="cit35"><label>35</label><citation-alternatives><mixed-citation xml:lang="ru">Distral: Robust multitask reinforcement learning / Y. Teh, V. Bapst, W.M. Czarnecki, J. Quan, J. Kirkpatrick, R. Hadsell, N. Heess, R. Pascanu // Advances in Neural Information Processing Systems. 2017. 30. P. 4496-4506. URL: https://proceedings.neurips.cc/paper/2017/hash/0abdc563a06105aee3c6136871c9f4d1-Abstract.html</mixed-citation><mixed-citation xml:lang="en">Teh Y., Bapst V., Czarnecki W.M., Quan J., Kirkpatrick J., Hadsell R., Heess N., Pascanu R. Distral: Robust multitask reinforcement learning. Advances in Neural Information Processing Systems, 2017, no. 30, pp.4496-4506. Available at: https://proceedings.neurips.cc/paper/2017/hash/0abdc563a06105aee3c6136871c9f4d1-Abstract.html</mixed-citation></citation-alternatives></ref><ref id="cit36"><label>36</label><citation-alternatives><mixed-citation xml:lang="ru">Ranjan R., Patel V.M., Chellappa R. Hyperface: A deep multi-task learning framework for face detection, landmark localization, pose estimation, and gender recognition // IEEE Transactions on Pattern Analysis and Machine Intelligence. 2017. 41(1). P. 121-135. https://doi.org/10.1109/TPAMI.2017.2781233</mixed-citation><mixed-citation xml:lang="en">Ranjan R., Patel V.M., Chellappa R. Hyperface: A deep multi-task learning framework for face detection, landmark localization, pose estimation, and gender recognition. IEEE Transactions on Pattern Analysis and Machine Intelligence, 2017, no. 41(1), pp. 121-135. https://doi.org/10.1109/TPAMI.2017.2781233</mixed-citation></citation-alternatives></ref><ref id="cit37"><label>37</label><citation-alternatives><mixed-citation xml:lang="ru">Parthasarathy S., Busso C. Jointly Predicting Arousal, Valence and Dominance with Multi-Task Learning // Interspeech. 
2017. P. 1103-1107. URL: https://www.isca-speech.org/archive/Interspeech_2017/pdfs/1494.PDF</mixed-citation><mixed-citation xml:lang="en">Parthasarathy S., Busso C. Jointly Predicting Arousal, Valence and Dominance with Multi-Task Learning. Interspeech. 2017:1103-1107. Available at: https://www.isca-speech.org/archive/Interspeech_2017/pdfs/1494.PDF</mixed-citation></citation-alternatives></ref><ref id="cit38"><label>38</label><citation-alternatives><mixed-citation xml:lang="ru">Progressive neural networks for transfer learning in emotion recognition / J. Gideon, S. Khorram, Z. Aldeneh, D. Dimitriadis, E.M. Provost // arXiv preprint arXiv:1706.03256. 2017.</mixed-citation><mixed-citation xml:lang="en">Gideon J., Khorram S., Aldeneh Z., Dimitriadis D., Provost E.M. Progressive neural networks for transfer learning in emotion recognition. arXiv preprint arXiv:1706.03256. 2017.</mixed-citation></citation-alternatives></ref><ref id="cit39"><label>39</label><citation-alternatives><mixed-citation xml:lang="ru">MSP-IMPROV: An acted corpus of dyadic interactions to study emotion perception / C. Busso, S. Parthasarathy, A. Burmania, M. AbdelWahab, N. Sadoughi, E.M. Provost // IEEE Transactions on Affective Computing. 2016. 8(1). P. 67-80. https://doi.org/10.1109/TAFFC.2016.2515617</mixed-citation><mixed-citation xml:lang="en">Busso C., Parthasarathy S., Burmania A., AbdelWahab M., Sadoughi N., Provost E.M. MSP-IMPROV: An acted corpus of dyadic interactions to study emotion perception. IEEE Transactions on Affective Computing, 2016, no. 8(1), pp. 67-80. https://doi.org/10.1109/TAFFC.2016.2515617</mixed-citation></citation-alternatives></ref><ref id="cit40"><label>40</label><citation-alternatives><mixed-citation xml:lang="ru">Kendall A., Gal Y., Cipolla R. Multi-task learning using uncertainty to weigh losses for scene geometry and semantics // Proceedings of the IEEE conference on computer vision and pattern recognition. 2018. P. 7482-7491. 
https://doi.org/10.1109/CVPR.2018.00781</mixed-citation><mixed-citation xml:lang="en">Kendall A., Gal Y., Cipolla R. Multi-task learning using uncertainty to weigh losses for scene geometry and semantics. Proceedings of the IEEE conference on computer vision and pattern recognition, 2018, pp. 7482-7491. https://doi.org/10.1109/CVPR.2018.00781</mixed-citation></citation-alternatives></ref><ref id="cit41"><label>41</label><citation-alternatives><mixed-citation xml:lang="ru">Liebel L., Körner M. Auxiliary tasks in multi-task learning // arXiv preprint arXiv:1805.06334. 2018.</mixed-citation><mixed-citation xml:lang="en">Liebel L., Körner M. Auxiliary tasks in multi-task learning. arXiv preprint arXiv:1805.06334. 2018.</mixed-citation></citation-alternatives></ref><ref id="cit42"><label>42</label><citation-alternatives><mixed-citation xml:lang="ru">A comparison of loss weighting strategies for multi task learning in deep neural networks / T. Gong, T. Lee, C. Stephenson, V. Renduchintala, S. Padhy, A. Ndirango, G. Keskin, O.H. Elibol // IEEE Access. 2019. 7. P. 141627-141632. https://doi.org/10.1109/ACCESS.2019.2943604</mixed-citation><mixed-citation xml:lang="en">Gong T., Lee T., Stephenson C., Renduchintala V., Padhy S., Ndirango A., Keskin G., Elibol O.H. A comparison of loss weighting strategies for multi task learning in deep neural networks. IEEE Access. 2019; 7:141627-141632. https://doi.org/10.1109/ACCESS.2019.2943604</mixed-citation></citation-alternatives></ref><ref id="cit43"><label>43</label><citation-alternatives><mixed-citation xml:lang="ru">Liu S., Johns E., Davison A. J. End-to-end multi-task learning with attention // Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition. 2019. P. 1871-1880. https://doi.org/10.1109/CVPR.2019.00197</mixed-citation><mixed-citation xml:lang="en">Liu S., Johns E., Davison A. J. End-to-end multi-task learning with attention. 
Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition, 2019, pp. 1871-1880. https://doi.org/10.1109/CVPR.2019.00197</mixed-citation></citation-alternatives></ref><ref id="cit44"><label>44</label><citation-alternatives><mixed-citation xml:lang="ru">Gradnorm: Gradient normalization for adaptive loss balancing in deep multitask networks / Z. Chen, V. Badrinarayanan, C.Y. Lee, A. Rabinovich // International Conference on Machine Learning. PMLR, 2018. P. 794-803. URL: http://proceedings.mlr.press/v80/chen18a.html</mixed-citation><mixed-citation xml:lang="en">Chen Z., Badrinarayanan V., Lee C.Y., Rabinovich A. Gradnorm: Gradient normalization for adaptive loss balancing in deep multitask networks. International Conference on Machine Learning. PMLR, 2018. pp. 794-803. http://proceedings.mlr.press/v80/chen18a.html</mixed-citation></citation-alternatives></ref><ref id="cit45"><label>45</label><citation-alternatives><mixed-citation xml:lang="ru">Krizhevsky A., Sutskever I., Hinton G. E. Imagenet classification with deep convolutional neural networks // Communications of the ACM. 2017. 60(6). P. 84-90. URL: https://dl.acm.org/doi/abs/10.1145/3065386</mixed-citation><mixed-citation xml:lang="en">Krizhevsky A., Sutskever I., Hinton G. E. Imagenet classification with deep convolutional neural networks. Communications of the ACM, 2017, no. 60(6), pp. 84-90. https://dl.acm.org/doi/abs/10.1145/3065386</mixed-citation></citation-alternatives></ref><ref id="cit46"><label>46</label><citation-alternatives><mixed-citation xml:lang="ru">Simonyan K., Zisserman A. Very deep convolutional networks for large-scale image recognition // arXiv preprint arXiv:1409.1556. 2014.</mixed-citation><mixed-citation xml:lang="en">Simonyan K., Zisserman A. Very deep convolutional networks for large-scale image recognition. arXiv preprint arXiv:1409.1556. 
2014.</mixed-citation></citation-alternatives></ref><ref id="cit47"><label>47</label><citation-alternatives><mixed-citation xml:lang="ru">He K. et al. Deep residual learning for image recognition // Proceedings of the IEEE conference on computer vision and pattern recognition. 2016. P. 770-778. URL: https://openaccess.thecvf.com/content_cvpr_2016/html/He_Deep_Residual_Learning_CVPR_2016_paper.html</mixed-citation><mixed-citation xml:lang="en">He K. et al. Deep residual learning for image recognition. Proceedings of the IEEE conference on computer vision and pattern recognition, 2016, pp. 770-778. Available at: https://openaccess.thecvf.com/content_cvpr_2016/html/He_Deep_Residual_Learning_CVPR_2016_paper.html</mixed-citation></citation-alternatives></ref><ref id="cit48"><label>48</label><citation-alternatives><mixed-citation xml:lang="ru">Kingma D.P., Ba J. Adam: A method for stochastic optimization // arXiv preprint arXiv:1412.6980. 2014.</mixed-citation><mixed-citation xml:lang="en">Kingma D.P., Ba J. Adam: A method for stochastic optimization. arXiv preprint arXiv:1412.6980. 2014.</mixed-citation></citation-alternatives></ref><ref id="cit49"><label>49</label><citation-alternatives><mixed-citation xml:lang="ru">Livingstone S.R., Russo F.A. The Ryerson Audio-Visual Database of Emotional Speech and Song (RAVDESS): A dynamic, multimodal set of facial and vocal expressions in North American English // PloS one. 2018. 13(5). P. e0196391. https://doi.org/10.1371/journal.pone.0196391</mixed-citation><mixed-citation xml:lang="en">Livingstone S.R., Russo F.A. The Ryerson Audio-Visual Database of Emotional Speech and Song (RAVDESS): A dynamic, multimodal set of facial and vocal expressions in North American English. PloS one, 2018, no. 13(5):e0196391. https://doi.org/10.1371/journal.pone.0196391</mixed-citation></citation-alternatives></ref><ref id="cit50"><label>50</label><citation-alternatives><mixed-citation xml:lang="ru">Mariooryad S., Lotfian R., Busso C. 
Building a naturalistic emotional speech corpus by retrieving expressive behaviors from existing speech corpora // Fifteenth Annual Conference of the International Speech Communication Association. 2014. URL: https://www.isca-speech.org/archive/interspeech_2014/i14_0238.html</mixed-citation><mixed-citation xml:lang="en">Mariooryad S., Lotfian R., Busso C. Building a naturalistic emotional speech corpus by retrieving expressive behaviors from existing speech corpora. Fifteenth Annual Conference of the International Speech Communication Association. 2014. Available at: https://www.isca-speech.org/archive/interspeech_2014/i14_0238.html</mixed-citation></citation-alternatives></ref><ref id="cit51"><label>51</label><citation-alternatives><mixed-citation xml:lang="ru">Maaten L., Hinton G. Visualizing data using t-SNE // Journal of machine learning research. 2008. 9(Nov). P. 2579-2605. URL: https://www.jmlr.org/papers/v9/vandermaaten08a.html</mixed-citation><mixed-citation xml:lang="en">Maaten L., Hinton G. Visualizing data using t-SNE. Journal of machine learning research, 2008, 9(Nov), pp. 2579-2605. Available at: https://www.jmlr.org/papers/v9/vandermaaten08a.html</mixed-citation></citation-alternatives></ref><ref id="cit52"><label>52</label><citation-alternatives><mixed-citation xml:lang="ru">Grad-cam: Visual explanations from deep networks via gradient-based localization / R.R. Selvaraju, M. Cogswell, A. Das, R. Vedantam, D. Parikh, D. Batra // Proceedings of the IEEE international conference on computer vision. 2017. P. 618-626. URL: https://openaccess.thecvf.com/content_iccv_2017/html/Selvaraju_Grad-CAM_Visual_Explanations_ICCV_2017_paper.html</mixed-citation><mixed-citation xml:lang="en">Selvaraju R.R., Cogswell M., Das A., Vedantam R., Parikh D., Batra D. Grad-cam: Visual explanations from deep networks via gradient-based localization. Proceedings of the IEEE international conference on computer vision. 2017:618-626. 
Available at: https://openaccess.thecvf.com/content_iccv_2017/html/Selvaraju_Grad-CAM_Visual_Explanations_ICCV_2017_paper.html</mixed-citation></citation-alternatives></ref></ref-list><fn-group><fn fn-type="conflict"><p>The authors declare that there are no conflicts of interest present.</p></fn></fn-group></back></article>
