Vous avez reçu un message "Your GitLab account has been locked ..." ? Pas d'inquiétude : lisez cet article https://docs.gricad-pages.univ-grenoble-alpes.fr/help/unlock/

Commit 3bc9963e authored by Sylvain Hatier's avatar Sylvain Hatier Committed by Arnaud Bey
Browse files

script lexique anglais dela To tsv

parent ee986ea0
# Script taking the following resource as input:
#   the XML lexicon file ($fileLexiconIn)
# Output -> lexicon-dela.tsv ($fileLexiconOut)
use strict;
use warnings;
use utf8;
use Encode;
use IO::Handle;
use Data::Dumper;
# Flush STDOUT after every write.
STDOUT->autoflush();
################################## USER LEXICON VARIABLE ########################################
# NOTE(review): not referenced anywhere else in the visible script; looks like a leftover.
my $userLanguage="frenchM2";
################################## USER LEXICON VARIABLE ########################################
################################## BEGIN DATA STRUCTURES ########################################
####### RESOURCE PATHS
my $fileLexiconIn="dela-en-public-u8.dic.xml";
# my $fileLexiconIn="extrait.xml";
my $fileLexiconOut="lexicon-dela.tsv";
# Entry id -> comma-separated lemma-level features, e.g. "pos=noun,feat1=value1".
my %roots2features;
# Entry id -> lemma string.
my %roots2lemma;
# Entry id -> inflected form -> comma-separated "feature=value" list for that form.
my %wordsLexicon;
################################## END DATA STRUCTURES ########################################
# The log file is opened (and therefore created/truncated) but nothing in the
# visible script writes to it, so a failure here is reported without aborting.
open(LOG,">:encoding(utf8)","loglst.txt")
    or warn "Cannot open 'loglst.txt' for writing: $!";
# Start reading
# NOTE(review): $inRules is not referenced anywhere else in the visible script.
my $inRules=0;
my $line;
################################## BEGIN READING LEXICON ENTRIES #######################################
# Both the output TSV and the input XML are mandatory: abort with a clear
# message instead of silently producing an empty result.
open(LEXICONTSV,">:encoding(utf8)",$fileLexiconOut)
    or die "Cannot open '$fileLexiconOut' for writing: $!";
open(LEXICON,"<:encoding(utf8)",$fileLexiconIn)
    or die "Cannot open '$fileLexiconIn' for reading: $!";
# Parser state shared with the read loop below.
my $currentID="";     # id of the entry being read (sequential counter value)
my $currentForm="";   # last lemma or <form> text seen
my $inLE=0;           # set between <entry> and </entry>; never read in the visible script
my $inLF=0;           # set after <lemma>, cleared at <inflected>
my $inIF=0;           # set between <inflected> and </inflected>
my $cptID=0;          # number of <entry> elements seen so far
# Line-oriented scan of the XML lexicon: each recognised tag is expected to
# start its own line. State is kept in $currentID / $currentForm and the
# $inLF / $inIF flags; one TSV row "form<TAB>lemma<TAB>features" is written
# per inflected form when the closing </entry> tag is reached.
#
# Debug aid: to stop after the first 1000 entries, extend the loop condition
# with `&& ($cptID <= 1000)`.
while ( defined($line = <LEXICON>) ) {
    if ($line =~ /^\s*<entry>/) {
        # New entry: allocate the next sequential id and reset the
        # "inside <inflected>" flag.
        $cptID++;
        $currentID = $cptID;
        $inLE = 1;
        $inIF = 0;
    }
    elsif ($line =~ /^\s*<\/entry>/) {
        $inLE = 0;
        # A missing lemma is written as an empty column.
        my $lemma = defined($roots2lemma{$currentID}) ? $roots2lemma{$currentID} : "";
        my $rootFeatures = $roots2features{$currentID};
        my $formsOfEntry = $wordsLexicon{$currentID} || {};
        # Sorted so that the output file is identical from run to run
        # (plain hash key order is not stable across Perl processes).
        foreach my $inflectedForm (sort keys %{$formsOfEntry}) {
            my $features = $formsOfEntry->{$inflectedForm};
            # Append the lemma-level features only when the entry has some,
            # so that no dangling "," is written otherwise.
            $features .= "," . $rootFeatures if defined($rootFeatures);
            print LEXICONTSV $inflectedForm . "\t" . $lemma . "\t" . $features . "\n";
        }
        # Everything for this entry has been written: drop it to keep memory flat.
        undef %wordsLexicon;
        undef %roots2lemma;
        undef %roots2features;
    }
    elsif ($line =~ /^\s*<lemma>([^<>]+)</) {
        # From here until <inflected>, <feat> lines describe the lemma itself.
        $inLF = 1;
        $currentForm = $1;
        $roots2lemma{$currentID} = $currentForm;
    }
    elsif ($line =~ /^\s*<\/inflected>/) {
        $inIF = 0;
    }
    elsif ($line =~ /^\s*<inflected>/) {
        $inIF = 1;
        $inLF = 0;
    }
    elsif ($line =~ /^\s*<form[^<>]*>(.+)</) {
        # NOTE(review): a form is only stored once a <feat> line follows it
        # inside <inflected>; forms without any <feat> never reach the TSV.
        # Confirm whether that is intended.
        $currentForm = $1;
    }
    elsif ($line =~ /^\s*<pos name='([^']+)'/) {
        # The part of speech is always recorded as a lemma-level feature.
        my $catValue = $1;
        $roots2features{$currentID} = exists($roots2features{$currentID})
            ? $roots2features{$currentID} . ",pos=" . $catValue
            : "pos=" . $catValue;
    }
    elsif ($line =~ /^\s*<feat name='([^']+)' value='([^']+)'/) {
        my $pair = $1 . "=" . $2;
        if ($inIF) {
            # Feature of the current inflected form.
            $wordsLexicon{$currentID}{$currentForm} = exists($wordsLexicon{$currentID}{$currentForm})
                ? $wordsLexicon{$currentID}{$currentForm} . "," . $pair
                : $pair;
        }
        elsif ($inLF) {
            # Feature of the lemma, shared by all of the entry's forms.
            $roots2features{$currentID} = exists($roots2features{$currentID})
                ? $roots2features{$currentID} . "," . $pair
                : $pair;
        }
    }
}
# Input handle: a failed close cannot lose data, so only report it.
close(LEXICON) or warn "Error while closing '$fileLexiconIn': $!";
################################## END READING LEXICON ENTRIES #######################################
# Output handle: buffered write errors (e.g. disk full) only surface at close,
# so treat a failure as fatal rather than leaving a truncated TSV unnoticed.
close(LEXICONTSV) or die "Error while closing '$fileLexiconOut': $!";
################################## END WRITING LEXICON TSV #######################################
close(LOG) or warn "Error while closing 'loglst.txt': $!";
\ No newline at end of file
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment