diff --git a/1-enrich-with-datacite/all_datacite_clients_for_uga.csv b/1-enrich-with-datacite/all_datacite_clients_for_uga.csv
index 48756bf651865705ffceabc0cb70672a63df1d80..7fdde2976b61aa0cbeaba700860bf0b3c15e9daa 100644
--- a/1-enrich-with-datacite/all_datacite_clients_for_uga.csv
+++ b/1-enrich-with-datacite/all_datacite_clients_for_uga.csv
@@ -1,5 +1,5 @@
 client,count,name,year,url
-cern.zenodo,791,Zenodo,2013,https://zenodo.org/
+cern.zenodo,798,Zenodo,2013,https://zenodo.org/
 inist.sshade,474,Solid Spectroscopy Hosting Architecture of Databases and Expertise,2019,https://www.sshade.eu/
 figshare.ars,255,figshare Academic Research System,2016,http://figshare.com/
 inist.osug,238,Observatoire des Sciences de l'Univers de Grenoble,2014,http://doi.osug.fr
@@ -7,7 +7,7 @@ dryad.dryad,160,DRYAD,2018,https://datadryad.org
 inist.resif,80,RÃ©seau sismologique et gÃ©odÃ©sique franÃ§ais,2014,https://www.resif.fr/
 inist.persyval,55,PERSYVAL-Lab : Pervasive Systems and Algorithms Lab,2016,
 rdg.prod,43,Recherche Data Gouv France,2022,https://recherche.data.gouv.fr/en
-inist.humanum,28,Huma-Num,2020,https://nakala.fr
+inist.humanum,29,Huma-Num,2020,https://nakala.fr
 fmsh.prod,28,Fondation Maison des sciences de l'homme,2023,
 figshare.sage,14,figshare SAGE Publications,2018,
 mcdy.dohrmi,12,dggv-e-publications,2020,https://www.dggv.de/publikationen/dggv-e-publikationen.html
diff --git a/1-enrich-with-datacite/nb-dois.txt b/1-enrich-with-datacite/nb-dois.txt
index 8377f99348cbd7e36b22b7e16060ef7b1479d1bd..cf548ec489e56015fa9c0738bb68c609eac2028d 100644
--- a/1-enrich-with-datacite/nb-dois.txt
+++ b/1-enrich-with-datacite/nb-dois.txt
@@ -1 +1 @@
-2211
\ No newline at end of file
+2219
\ No newline at end of file
diff --git a/2-produce-graph/hist-evol-datasets-per-repo.png b/2-produce-graph/hist-evol-datasets-per-repo.png
index c4fd7557ad994eb229972f01c190d4b51ebe5e5d..64e4f7a4c5b5d365d515d9bc9119dded9687cfe2 100644
Binary files a/2-produce-graph/hist-evol-datasets-per-repo.png and b/2-produce-graph/hist-evol-datasets-per-repo.png differ
diff --git a/2-produce-graph/hist-last-datasets-by-client.png b/2-produce-graph/hist-last-datasets-by-client.png
index ed6fa6dcb5442a773f3332a20dd35716f3384187..32599a1d3f08d9895341e20775caf875a92aabdf 100644
Binary files a/2-produce-graph/hist-last-datasets-by-client.png and b/2-produce-graph/hist-last-datasets-by-client.png differ
diff --git a/2-produce-graph/hist-quantity-year-type.png b/2-produce-graph/hist-quantity-year-type.png
index a254b70ccccfca3e4dac25c683e05edff8cd0c07..3639f813d44fecbed6b25976fba98a9bf2cb1bd5 100644
Binary files a/2-produce-graph/hist-quantity-year-type.png and b/2-produce-graph/hist-quantity-year-type.png differ
diff --git a/2-produce-graph/pie--datacite-client.png b/2-produce-graph/pie--datacite-client.png
index b36bd512174f8f1359953795fe6ea64e600baf13..1c1f01621dd280e5e49b339ade19a5ff58cee386 100644
Binary files a/2-produce-graph/pie--datacite-client.png and b/2-produce-graph/pie--datacite-client.png differ
diff --git a/2-produce-graph/pie--datacite-type.png b/2-produce-graph/pie--datacite-type.png
index 63ec77cb9a355a5e2f077a490730f31c69da59f6..59efdc9b0abd5bb9260612f0fd15c0ed0b8f5b00 100644
Binary files a/2-produce-graph/pie--datacite-type.png and b/2-produce-graph/pie--datacite-type.png differ
diff --git a/dois-uga.csv b/dois-uga.csv
index 16e11205be8976103426ef75d8cfcbbbdef0bc28..6a9e4d9fb40293e371f17b11767e3ef499c9a358 100644
--- a/dois-uga.csv
+++ b/dois-uga.csv
@@ -7926,3 +7926,565 @@ Each file is in .txt format, the decimal separator is a point '.' and the column
  under climate change, with great potential to alter forest functioning and
  community dynamics.",mds,True,findable,0,0,0,0,0,2024-04-23T15:10:23.000Z,2024-04-23T15:10:24.000Z,dryad.dryad,dryad,"FOS: Biological sciences,FOS: Biological sciences,European beech,Fagus sylvatica,Fecundity,masting,Climate change,seed production,Plant reproduction,MASTREE+","[{'subject': 'FOS: Biological sciences', 'subjectScheme': 'fos'}, {'subject': 'FOS: Biological sciences', 'schemeUri': 'http://www.oecd.org/science/inno/38235147.pdf', 'subjectScheme': 'Fields of Science and Technology (FOS)'}, {'subject': 'European beech'}, {'subject': 'Fagus sylvatica'}, {'subject': 'Fecundity', 'schemeUri': 'https://github.com/PLOS/plos-thesaurus', 'subjectScheme': 'PLOS Subject Area Thesaurus'}, {'subject': 'masting'}, {'subject': 'Climate change', 'schemeUri': 'https://github.com/PLOS/plos-thesaurus', 'subjectScheme': 'PLOS Subject Area Thesaurus'}, {'subject': 'seed production'}, {'subject': 'Plant reproduction', 'schemeUri': 'https://github.com/PLOS/plos-thesaurus', 'subjectScheme': 'PLOS Subject Area Thesaurus'}, {'subject': 'MASTREE+'}]",['358827 bytes'],
 10.5281/zenodo.11068959,Visualizando A Secagem De Concretos RefratÃ¡rios Via Tomografia De NÃªutrons,Zenodo,2024,,Text,Creative Commons Attribution 4.0 International,"RefratÃ¡rios monolÃticos sÃ£o uma alternativa aos materiais conformados (tijolos) e apresentam um grande potencial de reduÃ§Ã£o de consumo de energia e emissÃ£o de gases do efeito estufa, uma vez que nÃ£o necessitam de uma prÃ©-queima, antes de seu aquecimento atÃ© a temperatura de operaÃ§Ã£o. Dentro desta categoria, os refratÃ¡rios com ligantes hidrÃ¡ulicos sÃ£o definidos a partir de sua alta resistÃªncia mecÃ¢nica a verde decorrente das reaÃ§Ãµes de hidrataÃ§Ã£o do seu ligante durante a etapa de cura. Como as temperaturas de operaÃ§Ã£o dos refratÃ¡rios sÃ£o muito superiores que a da ebuliÃ§Ã£o da Ã¡gua, o processo de secagem Ã© intrÃnseco aos refratÃ¡rios hidrÃ¡ulicos. Esta etapa tambÃ©m Ã© a mais longa dentro do processo de instalaÃ§Ã£o de revestimentos monolÃticos devido ao risco de explosÃµes resultantes da combinaÃ§Ã£o de pressurizaÃ§Ã£o do vapor de Ã¡gua e de tensÃµes termomecÃ¢nicas. Assim, o entendimento de como a remoÃ§Ã£o desta Ã¡gua se dÃ¡ do meio poroso parcialmente saturado Ã© de grande interesse para permitir a otimizaÃ§Ã£o deste processo. O presente trabalho propÃµe o uso da tÃ©cnica de tomografia de nÃªutrons para a visualizaÃ§Ã£o direta da secagem. Foi possÃvel propor uma configuraÃ§Ã£o de secagem unidirecional, necessÃ¡ria para impedir o aparecimento de efeitos nÃ£o-fÃsicos nos resultados e tambÃ©m aproximar as condiÃ§Ãµes laboratoriais daquela encontrada na realidade industrial. AlÃ©m da visualizaÃ§Ã£o direta, a anÃ¡lise quantitativa do acÃºmulo de Ã¡gua foi possÃvel, bem como a proposta de mapas de secagem, nos quais se evidenciam as frentes de secagem, as regiÃµes de acÃºmulo de Ã¡gua e sua evoluÃ§Ã£o no tempo. Adicionalmente, os resultados obtidos por tal tÃ©cnica sÃ£o de suma importÃ¢ncia para aperfeiÃ§oar os modelos numÃ©ricos disponÃveis na literatura.",api,True,findable,0,0,0,0,0,2024-04-25T20:30:19.000Z,2024-04-25T20:30:19.000Z,cern.zenodo,cern,,,,
+10.34847/nkl.ddd67398,Pietro da Montagnana - Ploutos,NAKALA - https://nakala.fr (Huma-Num - CNRS),2024,la,Text,,Traduction en latin de la piÃ¨ce de thÃ©Ã¢tre Ploutos d'Aristophane par Pietro da Montagnana (15e siÃ¨cle),api,True,findable,0,0,0,0,0,2024-05-03T11:50:55.000Z,2024-05-03T11:50:55.000Z,inist.humanum,jbru,"TEI,Traduction,Aristophane,15e siÃ¨cle,Ploutos,latin,Pietro da Montagnana","[{'lang': 'fr', 'subject': 'TEI'}, {'lang': 'fr', 'subject': 'Traduction'}, {'lang': 'fr', 'subject': 'Aristophane'}, {'lang': 'fr', 'subject': '15e siÃ¨cle'}, {'lang': 'fr', 'subject': 'Ploutos'}, {'lang': 'fr', 'subject': 'latin'}, {'lang': 'fr', 'subject': 'Pietro da Montagnana'}]",['30687 Bytes'],['text/xml']
+10.5281/zenodo.11064977,Data from : Tree inventory data from permanent plots in French forest reserves,Zenodo,2024,en,Dataset,Creative Commons Attribution 4.0 International,"We present a dataset resulting from the first round of a national monitoring program of forest reserves. It contains 9538 permanent plots, distributed across 111 study sites in mainland France (including Corsica). Notably focusing on dead wood measurement, this protocol has primarily been applied in strict forest reserves and special nature reserves (sensu Bollmann et Braunisch 2013), with 68% (6494) of the plots being currently located in strict forest reserves (unmanaged) and 24,7% (2363 plots) in forests unmanaged for at least 50 years. Sites cover a large variety of ecological conditions, from lowland to subalpine forests, but with an underrepresentation of Mediterranean forests (Table 1). The protocol assesses all the stages of a tree's life cycle, from seedling to decomposed lying dead wood. On each plot, a combination of three sampling techniques was used: (i) fixed area inventory for regeneration, standing dead trees, living trees and coarse woody debris (CWD) with diameter over 30 cm, (ii) transect lines for CWD with diameter < 30 cm, and (iii) fixed angle plot method for living trees with a diameter at breast height (DBH) > 30 cm (using a relascopic angle of 3%). Measurements include: exact tree location (azimuth, distance), species, diameter(s), tree-related microhabitats, decay stage and bark cover, seedling cover. With the ongoing climate change, the program network can also provide important information to monitor changes in forest ecosystems. It can also be used as forest management monitoring or conservation status assessment.",api,True,findable,0,0,0,0,1,2024-04-29T07:09:43.000Z,2024-04-29T07:09:43.000Z,cern.zenodo,cern,,,,
+10.5281/zenodo.11064978,Data from : Tree inventory data from permanent plots in French forest reserves,Zenodo,2024,en,Dataset,Creative Commons Attribution 4.0 International,"We present a dataset resulting from the first round of a national monitoring program of forest reserves. It contains 9538 permanent plots, distributed across 111 study sites in mainland France (including Corsica). Notably focusing on dead wood measurement, this protocol has primarily been applied in strict forest reserves and special nature reserves (sensu Bollmann et Braunisch 2013), with 68% (6494) of the plots being currently located in strict forest reserves (unmanaged) and 24,7% (2363 plots) in forests unmanaged for at least 50 years. Sites cover a large variety of ecological conditions, from lowland to subalpine forests, but with an underrepresentation of Mediterranean forests (Table 1). The protocol assesses all the stages of a tree's life cycle, from seedling to decomposed lying dead wood. On each plot, a combination of three sampling techniques was used: (i) fixed area inventory for regeneration, standing dead trees, living trees and coarse woody debris (CWD) with diameter over 30 cm, (ii) transect lines for CWD with diameter < 30 cm, and (iii) fixed angle plot method for living trees with a diameter at breast height (DBH) > 30 cm (using a relascopic angle of 3%). Measurements include: exact tree location (azimuth, distance), species, diameter(s), tree-related microhabitats, decay stage and bark cover, seedling cover. With the ongoing climate change, the program network can also provide important information to monitor changes in forest ecosystems. It can also be used as forest management monitoring or conservation status assessment.",api,True,findable,0,0,0,0,0,2024-04-29T07:09:43.000Z,2024-04-29T07:09:43.000Z,cern.zenodo,cern,,,,
+10.5281/zenodo.10980636,M-POPP datasets: Datasets for full page text recognition and information extraction from French handwritten and printed marriage records,Zenodo,2024,fr,Dataset,Creative Commons Attribution 4.0 International,"M-POPP datasets
+
+This repository contains 2 datasets created within the EXO-POPP project (Optical EXtraction of handwritten named entities for marriage records of the POPulation of Paris) for the task of text recognition and information extraction. These datasets have been published in End-to-end information extraction in handwritten documents: Understanding Paris marriage records from 1880 to 1940 [1]at ICDAR 2024.
+
+The EXO-POPP project aims to establish a comprehensive database comprising 300,000 marriage records from Paris and its suburbs, spanning the years 1880 to 1940, which are preserved in over 130,000 scans of double pages. Each marriage record may encompass up to 118 distinct types of information that require extraction from plain text. The M-POPP corpus (which stands for Marriage records of the POPulation of Paris) is the corpus on which the EXO-POPP project focuses. This corpus was built by gathering the marriage records of Paris and its suburb regions (Hauts- de-Seine, Seine-Saint-Denis, Val-de-Marne).
+
+The M-POPP corpus are a subset of the M-POPP database with annotations for full-page text recognition and named entity recognition/information extraction from both handwritten and printed documents. The first dataset comprises handwritten marriage records, while the second dataset consists of typewritten marriage records. It should be noted that even in typewritten marriage records, some handwritten information occurs, especially concerning the names of the spouses, and notes in the margin.The dataset contains single-page images obtained from the original scans of double pages via page segmentation.
+
+The structure of the files is the following:
+
+
+
+handwritten:Â the handwritten dataset
+
+
+
+images: images of the dataset divided following the split used in [1]
+
+
+
+train
+
+valid
+
+test
+
+
+
+labels:Â labels for joint handwritten text recognition and information extraction for each encoding tested in [1]
+
+
+
+printed: the printed dataset
+
+
+
+images:Â images of the dataset divided following the split used in [1]
+
+
+
+train
+
+valid
+
+test
+
+
+
+labels:Â labels for joint handwritten text recognition and information extraction for each encoding tested in [1]
+
+
+
+encoding-2-to-encoding-5.json:Â a JSON file giving the correspondence between the symbols of encoding 2 and encoding 5.
+
+
+Â 
+
+Table 1: Details on the split of the handwritten dataset.
+
+
+
+
+Â 
+Train
+Validation
+Test
+
+
+Pages
+250
+32
+32
+
+
+Acts
+344
+51
+53
+
+
+Named entities
+16727
+2223
+2517
+
+
+
+
+Â 
+
+Table 2: Details on the split of the printed dataset.
+
+
+
+
+Â 
+Train
+Validation
+Test
+
+
+Pages
+116
+14
+13
+
+
+Acts
+363
+43
+30
+
+
+Named entities
+22036
+2559
+2405
+
+
+
+
+Â 
+
+Table 3: Average annotation statistics per act for the two M-POPP datasets.
+
+
+
+
+Dataset
+# of characters
+# of words
+# of named entities
+
+
+Handwritten
+1519
+231
+48
+
+
+Printed
+1328
+200
+60
+
+
+
+
+Â 
+
+Document structure Annotation
+
+We employ the procedure applied in [2], which involves adding opening and closing tags to the character set for each text block we want to recognize.In total, we define four types of text blocks.
+
+
+
+Block A is located in the margin and contains the last names of the married couple, possibly with their first names and the date of the marriage.
+
+Block B is the body of the text. Block B is the one that contains most of the information to be extracted.
+
+Block C is optional and corresponds to marginal notes used in various cases, such as the mention of a divorce or a correction made to the act.
+
+Block D corresponds to a set containing a block A and a block B, optionally with one or more blocks C.
+
+
+Â 
+
+Information Extraction annotation
+
+The dataset contains 118 information categories. As explained in the paper, we broke down the named entities into sub-elements pertaining to 4 hierarchical levels, which reduces the total number of categories to 23 instead of 118. Notice that level 1, 2, and 3 categories do not encode named entities but rather the relations that may occur between some lower level categories for example: (day, birth, husband) encodes the fact that the annotated piece of text is the date of birth of the husband.Â 
+
+For these datasets, we chose to represent these hierarchical elements with emojis. For instance, the information first name is represented by the emoji ðŸ’¬.The meaning of each emoji can be found in Table 4. To determine the best way to encode named entities in the ground truth, we compared in [1] 5 types of encoding. To illustrate these encodings, letâ€™s take for instanceÂ Louis Alexandre MOUDEL that we define as the father of the bride, where Louis Alexandre are his two first names, and Moudel is his last name.Â 
+
+1) Single separate tags before each word: In this approach, each level of information is indicated by a dedicated tag, and the tags are placed before the word they encode information for. With this encoding, the ground truth for the example would be:
+
+ðŸ’¬ðŸ‘´ðŸ‘°Louis Â  ðŸ’¬ðŸ‘´ðŸ‘°AlexandreÂ  ðŸ—¨ï¸ðŸ‘´ðŸ‘°MOUDEL
+
+2) Single separate tags after each word: Similar to the previous approach, except here the tags are placed after the word. With this encoding the previous example becomes:
+
+LouisðŸ‘°ðŸ‘´ðŸ’¬Â  AlexandreðŸ‘°ðŸ‘´ðŸ’¬Â  MOUDELðŸ‘°ðŸ‘´ðŸ—¨ï¸
+
+3) Open & close separate tags: Here, each word presenting information to be extracted is surrounded by one or more opening and closing tags, where each tag encodes a level of information. So the example would be as:
+
+<ðŸ‘°> <ðŸ‘´> <ðŸ’¬> Louis <\ðŸ’¬> <\ðŸ‘´> <\ðŸ‘°><ðŸ‘°> <ðŸ‘´> <ðŸ’¬> Alexandre <\ðŸ’¬> <\ðŸ‘´> <\ðŸ‘°><ðŸ‘°> <ðŸ‘´> <ðŸ—¨ï¸> MOUDEL <\ðŸ—¨ï¸> <\ðŸ‘´> <\ðŸ‘°>
+
+4) Nested open & close separate tags: Similar to the previous approach, but this time a tag is closed only when the encoded information is no longer the same for that level of information. We can see in the example below that the tags for wife and father are only used twice.
+
+<ðŸ‘°> <ðŸ‘´> <ðŸ’¬> Louis Alexandre <\ðŸ’¬> <ðŸ—¨ï¸> MOUDEL <\ðŸ—¨ï¸>
+
+5) Single combined tags after each word: In the last approach, one tag encodes all the hierarchical levels constituting information. The tags are located after the word they encode information for.Â 
+
+Louis<wife_father_first_name>Â  Alexandre<wife_father_first_name>Â  MOUDEL<wife_father_family_name>
+
+NB: In the labels file of encoding 5, the information are still encoded with emojis but the chosen emojis do not have a semantic meaning due to the number of information categories to be represented. The correspondence between the symbols of encoding 2 and encoding 5 can be found in the fileÂ encoding-2-to-encoding-5.json.
+
+Â 
+
+Table 4: Details of the hierarchical breakdown of named entities. Each tag is placed in the corresponding hierarchical level and associated with the emoji representing it.
+
+
+
+
+Level
+Tags
+Â 
+Â 
+Â 
+
+
+1
+Administrative ðŸ“–
+
+
+Husband ðŸ‘¨
+
+Wife ðŸ‘°
+Witness ðŸ¥¸
+
+
+2
+Father ðŸ‘´
+Mother ðŸ‘µ
+Ex-husband ðŸ’”
+Â 
+
+
+3
+Birth ðŸ¥
+Residence ðŸ 
+Â 
+Â 
+
+
+4
+First name ðŸ’¬
+Family name ðŸ—¨ï¸
+Age âŒ›
+Occupation ðŸ”§
+
+
+5
+Street number ðŸ”Ÿ
+Street type ðŸ›£
+Street name ðŸ” 
+City ðŸŒ†
+
+
+Â 
+Department ðŸ—º
+Country ðŸ—º
+Day ðŸŒž
+Month ðŸ“…
+
+
+Â 
+Year ðŸ—“
+Hour â°
+Minute â±
+Â 
+
+
+
+
+Â 
+
+Â 
+
+Citation Request
+
+If you publish material based on this database, we request you to include a reference to the paperÂ T. Constum, L. Preel, T. Paquet, P. Tranouez, S. BrÃ©e, End-to-end information extraction in handwritten documents: Understanding Paris marriage records from 1880 to 1940, International Conference on Document Analysis and Recognition (ICDAR), Athens, Greece, 2024.
+
+Â 
+
+Bibliography
+
+1: T. Constum, L. Preel, T. Paquet, P. Tranouez, S. BrÃ©e, End-to-end information extraction in handwritten documents: Understanding Paris marriage records from 1880 to 1940, International Conference on Document Analysis and Recognition (ICDAR), Athens, Greece, 2024.
+
+2: D.Coquenet, C. Chatelain, T. Paquet: DAN: a Segmentation-free Document Attention Network for Handwritten Document Recognition. IEEE Transactions on Pattern Analysis and Machine Intelligence pp. 1â€“17 (2023).",api,True,findable,0,0,0,0,0,2024-04-30T13:04:07.000Z,2024-04-30T13:04:08.000Z,cern.zenodo,cern,"handwriting text recognition,document understanding,named entity recognition,information extraction","[{'subject': 'handwriting text recognition'}, {'subject': 'document understanding'}, {'subject': 'named entity recognition'}, {'subject': 'information extraction'}]",,
+10.5281/zenodo.11109612,"Simulations and scripts for ""Glacier surges controlled by the close interplay between subglacial friction and drainage"".",Zenodo,2024,,Dataset,Creative Commons Attribution 4.0 International,"This repository contains the model and scripts to reproduce the results presented in ""Glacier surges controlled by the close interplay between subglacial friction and drainage"" and submitted to the Journal of Geophysical Research - Earth Surface. It provides the running model files associated with each result figure of the manuscript as well as the Python script to generate them from the simulation output. The model is also described and updated at: https://github.com/kjetilthogersen/pyGlacier.",api,True,findable,0,0,0,0,0,2024-05-03T12:54:02.000Z,2024-05-03T12:54:02.000Z,cern.zenodo,cern,,,,
+10.5281/zenodo.11115059,NeoGeographyToolkit/StereoPipeline: 2024-05-05-daily-build,Zenodo,2024,,Software,Creative Commons Attribution 4.0 International,Recent additions log: https://stereopipeline.readthedocs.io/en/latest/news.html,api,True,findable,0,0,0,0,0,2024-05-05T08:16:10.000Z,2024-05-05T08:16:10.000Z,cern.zenodo,cern,,,,
+10.5281/zenodo.10980635,M-POPP datasets: Datasets for full page text recognition and information extraction from French handwritten and printed marriage records,Zenodo,2024,fr,Dataset,Creative Commons Attribution 4.0 International,"M-POPP datasets
+
+This repository contains 2 datasets created within the EXO-POPP project (Optical EXtraction of handwritten named entities for marriage records of the POPulation of Paris) for the task of text recognition and information extraction. These datasets have been published in End-to-end information extraction in handwritten documents: Understanding Paris marriage records from 1880 to 1940 [1]at ICDAR 2024.
+
+The EXO-POPP project aims to establish a comprehensive database comprising 300,000 marriage records from Paris and its suburbs, spanning the years 1880 to 1940, which are preserved in over 130,000 scans of double pages. Each marriage record may encompass up to 118 distinct types of information that require extraction from plain text. The M-POPP corpus (which stands for Marriage records of the POPulation of Paris) is the corpus on which the EXO-POPP project focuses. This corpus was built by gathering the marriage records of Paris and its suburb regions (Hauts- de-Seine, Seine-Saint-Denis, Val-de-Marne).
+
+The M-POPP corpus are a subset of the M-POPP database with annotations for full-page text recognition and named entity recognition/information extraction from both handwritten and printed documents. The first dataset comprises handwritten marriage records, while the second dataset consists of typewritten marriage records. It should be noted that even in typewritten marriage records, some handwritten information occurs, especially concerning the names of the spouses, and notes in the margin.The dataset contains single-page images obtained from the original scans of double pages via page segmentation.
+
+The structure of the files is the following:
+
+
+
+handwritten:Â the handwritten dataset
+
+
+
+images: images of the dataset divided following the split used in [1]
+
+
+
+train
+
+valid
+
+test
+
+
+
+labels:Â labels for joint handwritten text recognition and information extraction for each encoding tested in [1]
+
+
+
+printed: the printed dataset
+
+
+
+images:Â images of the dataset divided following the split used in [1]
+
+
+
+train
+
+valid
+
+test
+
+
+
+labels:Â labels for joint handwritten text recognition and information extraction for each encoding tested in [1]
+
+
+
+encoding-2-to-encoding-5.json:Â a JSON file giving the correspondence between the symbols of encoding 2 and encoding 5.
+
+
+Â 
+
+Table 1: Details on the split of the handwritten dataset.
+
+
+
+
+Â 
+Train
+Validation
+Test
+
+
+Pages
+250
+32
+32
+
+
+Acts
+344
+51
+53
+
+
+Named entities
+16727
+2223
+2517
+
+
+
+
+Â 
+
+Table 2: Details on the split of the printed dataset.
+
+
+
+
+Â 
+Train
+Validation
+Test
+
+
+Pages
+116
+14
+13
+
+
+Acts
+363
+43
+30
+
+
+Named entities
+22036
+2559
+2405
+
+
+
+
+Â 
+
+Table 3: Average annotation statistics per act for the two M-POPP datasets.
+
+
+
+
+Dataset
+# of characters
+# of words
+# of named entities
+
+
+Handwritten
+1519
+231
+48
+
+
+Printed
+1328
+200
+60
+
+
+
+
+Â 
+
+Document structure Annotation
+
+We employ the procedure applied in [2], which involves adding opening and closing tags to the character set for each text block we want to recognize.In total, we define four types of text blocks.
+
+
+
+Block A is located in the margin and contains the last names of the married couple, possibly with their first names and the date of the marriage.
+
+Block B is the body of the text. Block B is the one that contains most of the information to be extracted.
+
+Block C is optional and corresponds to marginal notes used in various cases, such as the mention of a divorce or a correction made to the act.
+
+Block D corresponds to a set containing a block A and a block B, optionally with one or more blocks C.
+
+
+Â 
+
+Information Extraction annotation
+
+The dataset contains 118 information categories. As explained in the paper, we broke down the named entities into sub-elements pertaining to 4 hierarchical levels, which reduces the total number of categories to 23 instead of 118. Notice that level 1, 2, and 3 categories do not encode named entities but rather the relations that may occur between some lower level categories for example: (day, birth, husband) encodes the fact that the annotated piece of text is the date of birth of the husband.Â 
+
+For these datasets, we chose to represent these hierarchical elements with emojis. For instance, the information first name is represented by the emoji ðŸ’¬.The meaning of each emoji can be found in Table 4. To determine the best way to encode named entities in the ground truth, we compared in [1] 5 types of encoding. To illustrate these encodings, letâ€™s take for instanceÂ Louis Alexandre MOUDEL that we define as the father of the bride, where Louis Alexandre are his two first names, and Moudel is his last name.Â 
+
+1) Single separate tags before each word: In this approach, each level of information is indicated by a dedicated tag, and the tags are placed before the word they encode information for. With this encoding, the ground truth for the example would be:
+
+ðŸ’¬ðŸ‘´ðŸ‘°Louis Â  ðŸ’¬ðŸ‘´ðŸ‘°AlexandreÂ  ðŸ—¨ï¸ðŸ‘´ðŸ‘°MOUDEL
+
+2) Single separate tags after each word: Similar to the previous approach, except here the tags are placed after the word. With this encoding the previous example becomes:
+
+LouisðŸ‘°ðŸ‘´ðŸ’¬Â  AlexandreðŸ‘°ðŸ‘´ðŸ’¬Â  MOUDELðŸ‘°ðŸ‘´ðŸ—¨ï¸
+
+3) Open & close separate tags: Here, each word presenting information to be extracted is surrounded by one or more opening and closing tags, where each tag encodes a level of information. So the example would be as:
+
+<ðŸ‘°> <ðŸ‘´> <ðŸ’¬> Louis <\ðŸ’¬> <\ðŸ‘´> <\ðŸ‘°><ðŸ‘°> <ðŸ‘´> <ðŸ’¬> Alexandre <\ðŸ’¬> <\ðŸ‘´> <\ðŸ‘°><ðŸ‘°> <ðŸ‘´> <ðŸ—¨ï¸> MOUDEL <\ðŸ—¨ï¸> <\ðŸ‘´> <\ðŸ‘°>
+
+4) Nested open & close separate tags: Similar to the previous approach, but this time a tag is closed only when the encoded information is no longer the same for that level of information. We can see in the example below that the tags for wife and father are only used twice.
+
+<ðŸ‘°> <ðŸ‘´> <ðŸ’¬> Louis Alexandre <\ðŸ’¬> <ðŸ—¨ï¸> MOUDEL <\ðŸ—¨ï¸>
+
+5) Single combined tags after each word: In the last approach, one tag encodes all the hierarchical levels constituting information. The tags are located after the word they encode information for.Â 
+
+Louis<wife_father_first_name>Â  Alexandre<wife_father_first_name>Â  MOUDEL<wife_father_family_name>
+
+NB: In the labels file of encoding 5, the information are still encoded with emojis but the chosen emojis do not have a semantic meaning due to the number of information categories to be represented. The correspondence between the symbols of encoding 2 and encoding 5 can be found in the fileÂ encoding-2-to-encoding-5.json.
+
+Â 
+
+Table 4: Details of the hierarchical breakdown of named entities. Each tag is placed in the corresponding hierarchical level and associated with the emoji representing it.
+
+
+
+
+Level
+Tags
+Â 
+Â 
+Â 
+
+
+1
+Administrative ðŸ“–
+
+
+Husband ðŸ‘¨
+
+Wife ðŸ‘°
+Witness ðŸ¥¸
+
+
+2
+Father ðŸ‘´
+Mother ðŸ‘µ
+Ex-husband ðŸ’”
+Â 
+
+
+3
+Birth ðŸ¥
+Residence ðŸ 
+Â 
+Â 
+
+
+4
+First name ðŸ’¬
+Family name ðŸ—¨ï¸
+Age âŒ›
+Occupation ðŸ”§
+
+
+5
+Street number ðŸ”Ÿ
+Street type ðŸ›£
+Street name ðŸ” 
+City ðŸŒ†
+
+
+Â 
+Department ðŸ—º
+Country ðŸ—º
+Day ðŸŒž
+Month ðŸ“…
+
+
+Â 
+Year ðŸ—“
+Hour â°
+Minute â±
+Â 
+
+
+
+
+Â 
+
+Â 
+
+Citation Request
+
+If you publish material based on this database, we request you to include a reference to the paperÂ T. Constum, L. Preel, T. Paquet, P. Tranouez, S. BrÃ©e, End-to-end information extraction in handwritten documents: Understanding Paris marriage records from 1880 to 1940, International Conference on Document Analysis and Recognition (ICDAR), Athens, Greece, 2024.
+
+Â 
+
+Bibliography
+
+1: T. Constum, L. Preel, T. Paquet, P. Tranouez, S. BrÃ©e, End-to-end information extraction in handwritten documents: Understanding Paris marriage records from 1880 to 1940, International Conference on Document Analysis and Recognition (ICDAR), Athens, Greece, 2024.
+
+2: D.Coquenet, C. Chatelain, T. Paquet: DAN: a Segmentation-free Document Attention Network for Handwritten Document Recognition. IEEE Transactions on Pattern Analysis and Machine Intelligence pp. 1â€“17 (2023).",api,True,findable,0,0,0,0,1,2024-04-30T13:04:08.000Z,2024-04-30T13:04:08.000Z,cern.zenodo,cern,"handwriting text recognition,document understanding,named entity recognition,information extraction","[{'subject': 'handwriting text recognition'}, {'subject': 'document understanding'}, {'subject': 'named entity recognition'}, {'subject': 'information extraction'}]",,
+10.5281/zenodo.11106596,Nonequilibrium Andreev resonances in ultraclean graphene Andreev interferometers,Zenodo,2024,,Dataset,Creative Commons Attribution 4.0 International,,api,True,findable,0,0,0,0,0,2024-05-02T20:26:03.000Z,2024-05-02T20:26:03.000Z,cern.zenodo,cern,,,,