Commit 92152051 authored by Sylvain Hatier's avatar Sylvain Hatier Committed by Arnaud Bey
Browse files

Script perl import lexicons

parent 5521d86d
......@@ -40,7 +40,7 @@
"argparse": {
"version": "1.0.10",
"resolved": "https://registry.npmjs.org/argparse/-/argparse-1.0.10.tgz",
"integrity": "sha512-o5Roy6tNG4SL/FOkCAN6RzjiakZS25RLYFrcMttJqbdd8BWrnA+fGz57iN5Pb06pvBGvl5gQ0B48dJlslXvoTg==",
"integrity": "sha1-vNZ5HqWuCXJeF+WtmIE0zUCz2RE=",
"requires": {
"sprintf-js": "~1.0.2"
}
......@@ -151,7 +151,7 @@
"brace-expansion": {
"version": "1.1.11",
"resolved": "https://registry.npmjs.org/brace-expansion/-/brace-expansion-1.1.11.tgz",
"integrity": "sha512-iCuPHDFgrHX7H2vEI/5xpz07zSHB00TpugqhmYtVmMO6518mCuRMoOYFldEBl0g187ufozdaHgWKcYFb61qGiA==",
"integrity": "sha1-PH/L9SnYcibz0vUrlm/1Jx60Qd0=",
"requires": {
"balanced-match": "^1.0.0",
"concat-map": "0.0.1"
......@@ -264,7 +264,7 @@
"concat-stream": {
"version": "1.6.2",
"resolved": "https://registry.npmjs.org/concat-stream/-/concat-stream-1.6.2.tgz",
"integrity": "sha512-27HBghJxjiZtIk3Ycvn/4kbJk/1uZuJFfuPEns6LaEvpvG1f0hTea8lilrouyo9mVc2GWdcEZ8OLoGmSADlrCw==",
"integrity": "sha1-kEvfGUzTEi/Gdcd/xKw9T/D9GjQ=",
"requires": {
"buffer-from": "^1.0.0",
"inherits": "^2.0.3",
......@@ -366,7 +366,7 @@
"errno": {
"version": "0.1.7",
"resolved": "https://registry.npmjs.org/errno/-/errno-0.1.7.tgz",
"integrity": "sha512-MfrRBDWzIWifgq6tJj60gkAwtLNb6sQPlcFrSOflcP1aFmmruKQ2wRnze/8V6kgyz7H3FF8Npzv78mZ7XLLflg==",
"integrity": "sha1-RoTXF3mtOa8Xfj8AeZb3xnyFJhg=",
"optional": true,
"requires": {
"prr": "~1.0.1"
......@@ -506,7 +506,7 @@
"function-bind": {
"version": "1.1.1",
"resolved": "https://registry.npmjs.org/function-bind/-/function-bind-1.1.1.tgz",
"integrity": "sha512-yIovAzMX49sF8Yl58fSCWJ5svSLuaibPxXQJFLmBObTuCr0Mf1KiPopGM9NiFjiYBCbfaa2Fh6breQ6ANVTI0A=="
"integrity": "sha1-pWiZ0+o8m6uHS7l3O3xe3pL0iV0="
},
"gaze": {
"version": "1.1.3",
......@@ -835,7 +835,7 @@
"iconv-lite": {
"version": "0.4.23",
"resolved": "https://registry.npmjs.org/iconv-lite/-/iconv-lite-0.4.23.tgz",
"integrity": "sha512-neyTUVFtahjf0mB3dZT77u+8O0QB89jFdnBkd5P1JgYPbPaia3gXXOVL2fq8VyU2gMMD7SaN7QukTB/pmXYvDA==",
"integrity": "sha1-KXhx9jvlB63Pv8pxXQzQ7thOmmM=",
"requires": {
"safer-buffer": ">= 2.1.2 < 3"
}
......@@ -876,7 +876,7 @@
"is-buffer": {
"version": "1.1.6",
"resolved": "https://registry.npmjs.org/is-buffer/-/is-buffer-1.1.6.tgz",
"integrity": "sha512-NcdALwpXkTm5Zvvbk7owOUSvVvBKDgKP5/ewfXEznmQFfs4ZRmanOeKBTjRVjka3QFoN6XJ+9F3USqfHqTaU5w=="
"integrity": "sha1-76ouqdqg16suoTqXsritUf776L4="
},
"is-builtin-module": {
"version": "1.0.0",
......@@ -1114,7 +1114,7 @@
"minimatch": {
"version": "3.0.4",
"resolved": "https://registry.npmjs.org/minimatch/-/minimatch-3.0.4.tgz",
"integrity": "sha512-yJHVQEhyqPLUTgt9B83PXu6W3rx4MvvHvSUvToogpwoGDOUQ+yDrR0HRot+yOCdCO7u4hX3pWft6kWBBcqh0UA==",
"integrity": "sha1-UWbihkV/AzBgZL5Ul+jbsMPTIIM=",
"requires": {
"brace-expansion": "^1.1.7"
}
......@@ -1297,7 +1297,7 @@
"process-nextick-args": {
"version": "2.0.0",
"resolved": "https://registry.npmjs.org/process-nextick-args/-/process-nextick-args-2.0.0.tgz",
"integrity": "sha512-MtEC1TqN0EU5nephaJ4rAtThHtC86dNN9qCuEhtshvpVBkAW5ZO7BASN9REnF9eoXGcRub+pFuKEpOHE+HbEMw=="
"integrity": "sha1-o31zL0JxtKsa0HDTVQjoKQeI/6o="
},
"promise": {
"version": "7.3.1",
......@@ -1462,7 +1462,7 @@
"safe-buffer": {
"version": "5.1.2",
"resolved": "https://registry.npmjs.org/safe-buffer/-/safe-buffer-5.1.2.tgz",
"integrity": "sha512-Gd2UZBJDkXlY7GbJxfsE8/nvKkUEU1G38c1siN6QP6a9PT9MmHB8GnpscSmMJSoF8LOIrt8ud/wPtojys4G6+g=="
"integrity": "sha1-mR7GnSluAxN0fVm9/St0XDX4go0="
},
"safe-json-parse": {
"version": "1.0.1",
......@@ -1473,7 +1473,7 @@
"safer-buffer": {
"version": "2.1.2",
"resolved": "https://registry.npmjs.org/safer-buffer/-/safer-buffer-2.1.2.tgz",
"integrity": "sha512-YZo3K82SD7Riyi0E1EQPojLz7kpepnSQI9IyPbHHg1XXXevb5dJI7tpyN2ADxGcQbHG7vcyRHk0cbwqcQriUtg=="
"integrity": "sha1-RPoWGwGHuVSd2Eu5GAL5vYOFzWo="
},
"screenfull": {
"version": "3.3.2",
......@@ -1521,7 +1521,7 @@
"spdx-expression-parse": {
"version": "3.0.0",
"resolved": "https://registry.npmjs.org/spdx-expression-parse/-/spdx-expression-parse-3.0.0.tgz",
"integrity": "sha512-Yg6D3XpRD4kkOmTpdgbUiEJFKghJH03fiC1OPll5h/0sO6neh2jqRDVHOQ4o/LMea0tgCkbMgea5ip/e+MkWyg==",
"integrity": "sha1-meEZt6XaAOBUkcn6M4t5BII7QdA=",
"requires": {
"spdx-exceptions": "^2.1.0",
"spdx-license-ids": "^3.0.0"
......@@ -1736,7 +1736,7 @@
"websocket-extensions": {
"version": "0.1.3",
"resolved": "https://registry.npmjs.org/websocket-extensions/-/websocket-extensions-0.1.3.tgz",
"integrity": "sha512-nqHUnMXmBzT0w570r2JpJxfiSD1IzoI+HGVdd3aZ0yNi3ngvQ4jv1dtHt5VGxfI2yj5yqImPhOK4vmIh2xMbGg==",
"integrity": "sha1-XS/yKXcAPsaHpLhwc9+7rBRszyk=",
"dev": true
},
"which": {
......
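# Script taking the following resources as input:
# - in the resource directory ($dirRessource), the lexicon files whose names match the pattern $nameLexiconFiles
#   (example of a pattern for Spanish: $nameLexiconFiles="MM.(adj|adv|int|nom|vaux|verb)");
# - the tagset file ($fileTagset, i.e. tagset.dat), which describes the categories and their features.
# Output: lexicon.tsv (the lexicon, one entry per line, as word\troot\tfeat1=value,feat2=value...)
# and spec.txt (language, relation type, description and the label expansions read from the tagset).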
use Encode;
use IO::Handle;
STDOUT->autoflush();
use strict;
use utf8;
use Data::Dumper;
################################## USER LEXICON SETTINGS ########################################
# Free-text metadata, written as-is to the spec file at the end of the script.
my $userLanguage="espagnol";
my $userDescription="ce lexique recouvre les mots ... et les traits";
my $userRelationType="dérivation, racine, etc ... définition par l'user";
################################## END USER LEXICON SETTINGS ####################################
################################## DATA STRUCTURES ##############################################
####### RESOURCE PATHS
# $dirRessource holds the tagset and the lexicon files; both output files are written there too.
my $dirRessource="entries-en";
my $fileTagset="tagset.dat";
my $fileLexiconOut="lexicon.tsv";
my $fileSpecOut="spec.txt";
#my $nameLexiconFiles="lefff\.(adj|adv|int|nom|vaux|verb)";
# The pattern is anchored on both sides when used, so only names without any dot are read;
# this leaves out ".", "..", the tagset and the two output files.
my $nameLexiconFiles="[^\.]+";
my $lexiconNameRe=qr/^(?:$nameLexiconFiles)$/;

# $cat2features{$catLabel}{$featureLabel}{$valueCode}=$valueCode
# e.g. $cat2features{"N"}{"type"}{"C"}="C";
my %cat2features;
# $labels2expand{$label}{"code=expansion"}=1, e.g. $labels2expand{"type"}{"C=common"}=1;
# category names are stored under the key "catégorie".
my %labels2expand;
# Number read from the second column of a rule, e.g. $cat2CharCoded{"N"}="2";
# it is recorded here but not used when decoding tags below.
my %cat2CharCoded;
# $cat2featuresOrder{$catLabel}{$position}=$featureLabel, positions starting at 0,
# e.g. $cat2featuresOrder{"N"}{0}="type";
my %cat2featuresOrder;
# $wordsLexicon{"abajaban"} = "abajar\tcat=V,type=M,mood=I,tense=I,person=3,num=P,gen=0"
# NOTE(review): keyed by word form only, so a later line with the same word replaces the
# earlier one (homographs keep a single analysis) — confirm this is intended.
my %wordsLexicon;
################################## END DATA STRUCTURES ##########################################
my $logPath="loglst.txt";
open(my $log,">:encoding(utf8)",$logPath)
    or die "Cannot open $logPath for writing: $!\n";

################################## READ TAGSET ##################################################
# Only the lines between <DecompositionRules> and </DecompositionRules> are parsed. Each rule is
#   CAT  NUMBER  NAME  feature/code:expansion;code:expansion  feature/...
# NOTE(review): the original header here described a different column layout
# (category, mono/polylexical, lemma, "termith" sense, contexts, ...), apparently inherited
# from another script; it does not match what this loop reads.
my $tagsetPath=$dirRessource."/".$fileTagset;
open(my $tagset,"<:encoding(utf8)",$tagsetPath)
    or die "Cannot open $tagsetPath: $!\n";
my $inRules=0;
while (defined(my $line = <$tagset>)) {
    # The closing tag is tested first and the opening tag last, so neither tag line
    # is ever handed to the rule parser.
    if ($line =~ /^<\/DecompositionRules>/) {
        $inRules=0;
    }
    if ($inRules) {
        if ($line =~ /^([A-Z]+)\s(\d+)\s(\S+)\s*(.*)/) {
            my ($catLabel, $charCat, $catExpand, $features)=($1,$2,$3,$4);
            $labels2expand{"catégorie"}{$catLabel."=".$catExpand}=1;
            $cat2CharCoded{$catLabel}=$charCat;
            print $log "$catLabel = $catExpand codé sur $charCat et traits : $features \n";
            my $cpt=0;
            foreach my $featureCase (split(" ",$features)) {
                print $log "FEATURE = $featureCase \n";
                if ($featureCase =~ /(.+)\/(.+)/) {
                    my ($featureLabel, $featureValuesString)=($1,$2);
                    $cat2featuresOrder{$catLabel}{$cpt}=$featureLabel;
                    foreach my $featureValue (split(";",$featureValuesString)) {
                        my @valuesNLabel=split(":",$featureValue);
                        $labels2expand{$featureLabel}{$valuesNLabel[0]."=".$valuesNLabel[1]}=1;
                        print $log "$featureLabel = ".$valuesNLabel[0]." expand by ".$valuesNLabel[1]." \n";
                        $cat2features{$catLabel}{$featureLabel}{$valuesNLabel[0]}=$valuesNLabel[0];
                    }
                }
                else {
                    print $log "BLEME RULES feature $featureCase \n";
                }
                # The position advances even for a malformed feature so later ones keep their slot.
                $cpt++;
            }
        }
        else {
            print $log "BLEME RULES EX $line \n";
        }
    }
    if ($line =~ /^<DecompositionRules>/) {
        $inRules=1;
    }
}
close($tagset);
################################## END READ TAGSET ##############################################
################################## READ LEXICON FILES ###########################################
opendir(my $lexiconDir,$dirRessource)
    or die "Cannot open directory $dirRessource: $!\n";
while (defined(my $fileLexicon = readdir($lexiconDir))) {
    next unless $fileLexicon =~ $lexiconNameRe;
    my $lexiconPath=$dirRessource."/".$fileLexicon;
    # Sub-directories (or anything that is not a plain file) carry no lexicon lines.
    next unless -f $lexiconPath;
    my $lexicon;
    unless (open($lexicon,"<:encoding(utf8)",$lexiconPath)) {
        # One unreadable file should not abort the whole import: report it and go on.
        warn "Cannot open $lexiconPath: $!\n";
        next;
    }
    print $log "LECTURE $fileLexicon \n";
    while (defined(my $line = <$lexicon>)) {
        if ($line =~ /^([^\s]+)\s([^\s]+)\s([^\s]+)/) {
            my ($word, $root, $features)=($1,$2,$3);
            # The tag is decoded one character at a time: the first character is the category,
            # each following character is the value of the feature declared at that position
            # in the tagset for this category.
            my @featuresLetter=split("",$features);
            my $cat=$featuresLetter[0];
            my @pairs=("cat=".$cat);
            for my $i (1 .. $#featuresLetter) {
                push(@pairs, $cat2featuresOrder{$cat}{$i-1}."=".$featuresLetter[$i]);
            }
            $wordsLexicon{$word}=$root."\t".join(",",@pairs);
        }
        else {
            print $log "BLEME LECTURE LINE $line \n";
        }
    }
    close($lexicon);
}
closedir($lexiconDir);
################################## END READ LEXICON FILES #######################################
################################## WRITE LEXICON TSV ############################################
# One line per word, sorted by word: word \t root \t cat=..,feature=value,...
my $lexiconOutPath=$dirRessource."/".$fileLexiconOut;
open(my $lexiconTsv,">:encoding(utf8)",$lexiconOutPath)
    or die "Cannot open $lexiconOutPath for writing: $!\n";
foreach my $keyWords (sort { $a cmp $b } keys(%wordsLexicon)) {
    print $lexiconTsv $keyWords."\t".$wordsLexicon{$keyWords}."\n";
}
close($lexiconTsv)
    or die "Cannot finish writing $lexiconOutPath: $!\n";
################################## END WRITE LEXICON TSV ########################################
################################## WRITE SPEC TXT ###############################################
# Metadata lines, then one "DESCR:value(label):code=expansion" line per expansion read from
# the tagset, e.g. DESCR:value(cat):A=adjectif
my $specOutPath=$dirRessource."/".$fileSpecOut;
open(my $specTxt,">:encoding(utf8)",$specOutPath)
    or die "Cannot open $specOutPath for writing: $!\n";
print $specTxt "LANGUAGE=".$userLanguage."\n";
print $specTxt "RELATIONTYPE=".$userRelationType."\n";
print $specTxt "DESCRIPTION=".$userDescription."\n";
print $specTxt "DESCR:label:cat=catégorie\n";
foreach my $keyExpand (sort { $a cmp $b } keys(%labels2expand)) {
    foreach my $keyValueExpand (sort { $a cmp $b } keys(%{$labels2expand{$keyExpand}})) {
        print $specTxt "DESCR:value(".$keyExpand."):".$keyValueExpand."\n";
    }
}
close($specTxt)
    or die "Cannot finish writing $specOutPath: $!\n";
################################## END WRITE SPEC TXT ###########################################
close($log);
\ No newline at end of file
# Script taking the following resource as input:
# - the XML lexicon file named by $fileLexiconIn
# Output: lexicon.tsv ($fileLexiconOut)
use Encode;
use IO::Handle;
STDOUT->autoflush();
use strict;
use utf8;
use Data::Dumper;
################################## USER LEXICON SETTINGS ########################################
# Language label of the lexicon; it is declared here but not written anywhere by this script.
my $userLanguage="frenchM2";
################################## END USER LEXICON SETTINGS ####################################
################################## DATA STRUCTURES ##############################################
####### RESOURCE PATHS
my $fileLexiconIn="Morphalou-2.0.xml";
# my $fileLexiconIn="extrait.xml";
my $fileLexiconOut="lexicon.tsv";
# All three hashes only ever hold the entry being read; they are emptied at each </lexicalEntry>.
my %roots2features;# $roots2features{$id}="feat1=value1,feat2=value2" (features of the lemmatized form)
my %roots2lemma;   # $roots2lemma{$id}=$lemma
my %wordsLexicon;  # $wordsLexicon{$id}{$inflectedForm}="feat1=value1,feat2=value2"
################################## END DATA STRUCTURES ##########################################
my $logPath="loglst.txt";
open(my $log,">:encoding(utf8)",$logPath)
    or die "Cannot open $logPath for writing: $!\n";
################################## READ LEXICON ENTRIES / WRITE TSV #############################
# The XML is scanned line by line with regular expressions (one element per line is expected);
# each entry is written out as soon as its closing tag is met:
#   form \t lemma \t features
open(my $lexiconTsv,">:encoding(utf8)",$fileLexiconOut)
    or die "Cannot open $fileLexiconOut for writing: $!\n";
open(my $lexicon,"<:encoding(utf8)",$fileLexiconIn)
    or die "Cannot open $fileLexiconIn: $!\n";
my $currentID="";
my $currentForm="";
my $inLF=0;# true while inside <lemmatizedForm> ... </lemmatizedForm>
my $inIF=0;# true from the first <inflectedForm> of an entry until the next <lexicalEntry>,
           # i.e. "this entry has at least one inflected form"
while (defined(my $line = <$lexicon>)) {
    if ($line =~ /^\s*<lexicalEntry id="([^"]+)"[^>]*>/) {
        # New entry: start from a clean state so nothing leaks from the previous entry.
        $currentID=$1;
        $currentForm="";
        $inLF=0;
        $inIF=0;
        print $log "Passage LA $currentID \n";
    }
    elsif ($line =~ /^\s*<\/lexicalEntry>/) {
        if (!$inIF) {
            # No inflected form in this entry: the lemma itself is emitted as the word.
            print $lexiconTsv $currentForm."\t";
            print $lexiconTsv $roots2lemma{$currentID}."\t";
            print $lexiconTsv $roots2features{$currentID}."\n";
        }
        else {
            # NOTE(review): forms are keyed by spelling, so two <inflectedForm> elements of the
            # same entry that share a spelling get their features concatenated on one line, and a
            # form without any grammatical feature is not emitted — confirm this is intended.
            my $entryForms=$wordsLexicon{$currentID} || {};
            # Sorted so that the output is reproducible from one run to the next.
            foreach my $inflectedForm (sort keys(%$entryForms)) {
                # The lemma features are appended after the form's own features.
                my $allFeatures=$entryForms->{$inflectedForm}.",".$roots2features{$currentID};
                print $log "Passage LU $currentForm \n";
                print $lexiconTsv $inflectedForm."\t";
                print $lexiconTsv $roots2lemma{$currentID}."\t";
                print $lexiconTsv $allFeatures."\n";
            }
        }
        # Entries are independent: drop everything collected for this one to keep memory flat.
        %wordsLexicon=();
        %roots2lemma=();
        %roots2features=();
    }
    elsif ($line =~ /^\s*<lemmatizedForm>/) {
        $inLF=1;
    }
    elsif ($line =~ /^\s*<\/lemmatizedForm>/) {
        # Leaving the lemma: later features must not be attributed to it.
        $inLF=0;
    }
    elsif ($line =~ /^\s*<inflectedForm>/) {
        $inIF=1;
    }
    elsif ($line =~ /^\s*<orthography[^<>]*>(.+)</) {
        $currentForm=$1;
        print $log "Passage LO $currentForm \n";
        # Any spelling met before the first inflected form is the lemma of the entry.
        if (!$inIF) {
            $roots2lemma{$currentID}=$currentForm;
        }
    }
    elsif ($line =~ /^\s*<(grammatical[^<>]+)>(.+)</) {
        # The element name is the feature name, its text content the value.
        my $pair=$1."=".$2;
        if ($inIF) {
            if (exists($wordsLexicon{$currentID}{$currentForm})) {
                $wordsLexicon{$currentID}{$currentForm}.=",".$pair;
            }
            else {
                $wordsLexicon{$currentID}{$currentForm}=$pair;
            }
        }
        elsif ($inLF) {
            if (exists($roots2features{$currentID})) {
                $roots2features{$currentID}.=",".$pair;
            }
            else {
                $roots2features{$currentID}=$pair;
            }
        }
    }
}
close($lexicon);
################################## END READ LEXICON ENTRIES #####################################
close($lexiconTsv)
    or die "Cannot finish writing $fileLexiconOut: $!\n";
################################## END WRITE LEXICON TSV ########################################
close($log);
\ No newline at end of file
# Script taking the following resources as input:
# - in the "lexiques" directory ($dirRessource), the files with a name like .+\..+ (selected with the pattern $nameLexiconFiles)
# Output: lexicon.tsv ($fileLexiconOut)
use Encode;
use IO::Handle;
STDOUT->autoflush();
use strict;
use utf8;
use Data::Dumper;
################################## USER LEXICON SETTINGS ########################################
# Language label of the lexicon; it is declared here but not written anywhere by this script.
my $userLanguage="frenchM3";
################################## END USER LEXICON SETTINGS ####################################
################################## DATA STRUCTURES ##############################################
####### RESOURCE PATHS
my $dirRessource="lexiques";
# The pattern is only anchored at the end when used, so it selects every directory entry whose
# name does not end with a dot (which leaves out "." and "..").
my $nameLexiconFiles="[^\.]+";
my $lexiconNameRe=qr/(?:$nameLexiconFiles)$/;
my $fileLexiconOut="lexicon.tsv";
# All three hashes only ever hold the entry being read; they are emptied at each </lexicalEntry>.
my %roots2features;# $roots2features{$id}="feat1=value1,feat2=value2" (features of the lemmatized form)
my %roots2lemma;   # $roots2lemma{$id}=$lemma
my %wordsLexicon;  # $wordsLexicon{$id}{$inflectedForm}="feat1=value1,feat2=value2"
################################## END DATA STRUCTURES ##########################################
my $logPath="loglst.txt";
open(my $log,">:encoding(utf8)",$logPath)
    or die "Cannot open $logPath for writing: $!\n";
################################## READ LEXICON ENTRIES / WRITE TSV #############################
# Every selected file is scanned line by line with regular expressions (one element per line is
# expected); each entry is written out as soon as its closing tag is met:
#   inflected form \t lemma \t features
open(my $lexiconTsv,">:encoding(utf8)",$fileLexiconOut)
    or die "Cannot open $fileLexiconOut for writing: $!\n";
opendir(my $lexiconDir,$dirRessource)
    or die "Cannot open directory $dirRessource: $!\n";
while (defined(my $fileLexicon = readdir($lexiconDir))) {
    next unless $fileLexicon =~ $lexiconNameRe;
    my $lexiconPath=$dirRessource."/".$fileLexicon;
    # Sub-directories (or anything that is not a plain file) carry no lexicon lines.
    next unless -f $lexiconPath;
    my $lexicon;
    unless (open($lexicon,"<:encoding(utf8)",$lexiconPath)) {
        # One unreadable file should not abort the whole import: report it and go on.
        warn "Cannot open $lexiconPath: $!\n";
        next;
    }
    # Parser state, restarted for every file.
    my $currentID="";
    my $currentForm="";
    my $inLF=0;# true while inside <lemmatizedForm> ... </lemmatizedForm>
    my $inIF=0;# true from the first <inflectedForm> of an entry until the next <lexicalEntry>
    while (defined(my $line = <$lexicon>)) {
        if ($line =~ /^\s*<lexicalEntry id="([^"]+)"[^>]*>/) {
            # New entry: start from a clean state so nothing leaks from the previous entry.
            $currentID=$1;
            $currentForm="";
            $inLF=0;
            $inIF=0;
            print $log "Passage LA $currentID \n";
        }
        elsif ($line =~ /^\s*<\/lexicalEntry>/) {
            # Only inflected forms are written; an entry without any produces no line.
            # NOTE(review): forms are keyed by spelling, so two <inflectedForm> elements of the
            # same entry that share a spelling get their features concatenated on one line, and a
            # form without any grammatical feature is not emitted — confirm this is intended.
            my $entryForms=$wordsLexicon{$currentID} || {};
            # Sorted so that the output is reproducible from one run to the next.
            foreach my $inflectedForm (sort keys(%$entryForms)) {
                # The lemma features are appended after the form's own features.
                my $allFeatures=$entryForms->{$inflectedForm}.",".$roots2features{$currentID};
                print $log "Passage LU $currentForm \n";
                print $lexiconTsv $inflectedForm."\t";
                print $lexiconTsv $roots2lemma{$currentID}."\t";
                print $lexiconTsv $allFeatures."\n";
            }
            # Entries are independent: drop everything collected for this one to keep memory flat.
            %wordsLexicon=();
            %roots2lemma=();
            %roots2features=();
        }
        elsif ($line =~ /^\s*<lemmatizedForm>/) {
            $inLF=1;
        }
        elsif ($line =~ /^\s*<\/lemmatizedForm>/) {
            # Leaving the lemma: later features must not be attributed to it.
            $inLF=0;
        }
        elsif ($line =~ /^\s*<inflectedForm>/) {
            $inIF=1;
        }
        elsif ($line =~ /^\s*<orthography[^<>]*>(.+)</) {
            $currentForm=$1;
            print $log "Passage LO $currentForm \n";
            # Any spelling met before the first inflected form is the lemma of the entry.
            if (!$inIF) {
                $roots2lemma{$currentID}=$currentForm;
            }
        }
        elsif ($line =~ /^\s*<(grammatical[^<>]+)>(.+)</) {
            # The element name is the feature name, its text content the value.
            my $pair=$1."=".$2;
            if ($inIF) {
                if (exists($wordsLexicon{$currentID}{$currentForm})) {
                    $wordsLexicon{$currentID}{$currentForm}.=",".$pair;
                }
                else {
                    $wordsLexicon{$currentID}{$currentForm}=$pair;
                }
            }
            elsif ($inLF) {
                if (exists($roots2features{$currentID})) {
                    $roots2features{$currentID}.=",".$pair;
                }
                else {
                    $roots2features{$currentID}=$pair;
                }
            }
        }
    }
    close($lexicon);
}
closedir($lexiconDir);
################################## END READ LEXICON ENTRIES #####################################
################################## END WRITE LEXICON TSV ########################################
close($lexiconTsv)
    or die "Cannot finish writing $fileLexiconOut: $!\n";
close($log);
\ No newline at end of file
Supports Markdown
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment