Commit b491604e authored by hatiers's avatar hatiers
Browse files

More lexicons specific script

parent bbaf7ece
# Script taking the following resource as input:
#   $fileLexiconIn
# It copies the tail of that file into $fileLexiconOut, starting at the line
# whose index is int(last_index / 4) and going up to the last line.
use Encode;
use IO::Handle;
STDOUT->autoflush();
use strict;
use warnings;
use utf8;
use Data::Dumper;
use locale;
################################## RESOURCE PATHS ########################################
my $fileLexiconIn = "lexicon-demi.tsv";
# my $fileLexiconIn="extrait.xml";
my $fileLexiconOut = "lexicon-quart.tsv";

################################## READ INPUT LEXICON ####################################
# Fail loudly when the input is missing instead of silently writing an empty output file.
open(my $lexiconIn, "<:encoding(utf8)", $fileLexiconIn)
    or die "Cannot open $fileLexiconIn for reading: $!";
my @lines = <$lexiconIn>;
close($lexiconIn);

################################## WRITE TSV LEXICON #####################################
open(my $lexiconOut, ">:encoding(utf8)", $fileLexiconOut)
    or die "Cannot open $fileLexiconOut for writing: $!";

# Index of the first line that is kept (the first quarter of the file is skipped).
my $firstKept = int($#lines / 4);

# The slice runs up to and including $#lines: the previous loop condition
# ($var < $#lines) stopped one element early and dropped the last line.
# For an empty input the range is empty and nothing is written.
print {$lexiconOut} @lines[$firstKept .. $#lines];

# Buffered write errors only surface at close time, so check it.
close($lexiconOut) or die "Cannot close $fileLexiconOut: $!";
################################## END WRITE TSV LEXICON #################################
\ No newline at end of file
# Script listing the non-alphabetic characters found in the first
# tab-separated column of $fileLexicon. Each distinct character is written
# once to the log file, in order of first appearance.
use Encode;
use IO::Handle;
STDOUT->autoflush();
use strict;
use warnings;
use utf8;
use Data::Dumper;
################################## DATA STRUCTURES ########################################
####### RESOURCE PATHS
my $dirRessource = "entries-ru";
my $fileLexicon  = "lexicon.tsv";
my $fileLog      = "log-tsv.txt";
my %oddchar;    # non-letter characters already reported (character => 1)
################################## END DATA STRUCTURES ####################################
open(my $log, ">:encoding(utf8)", $fileLog)
    or die "Cannot open $fileLog for writing: $!";
################################## READ LEXICON ENTRIES ###################################
my $pathLexicon = $dirRessource."/".$fileLexicon;
# A missing lexicon used to produce an empty log without any diagnostic.
open(my $lexicon, "<:encoding(utf8)", $pathLexicon)
    or die "Cannot open $pathLexicon for reading: $!";
print {$log} "LECTURE $fileLexicon \n";
while (my $line = <$lexicon>) {
    # Only the first field is inspected; lines that do not start with a
    # non-empty field followed by a tab are ignored.
    my ($firstField) = $line =~ /^([^\t]+)\t/;
    next unless defined $firstField;

    foreach my $char (split("", $firstField)) {
        # \P{L} matches any character that is not a Unicode letter.
        next unless $char =~ /\P{L}/;
        next if exists $oddchar{$char};
        $oddchar{$char} = 1;
        print {$log} "ONE -> ".$char."\n";
    }
}
close($lexicon);
################################## END READ LEXICON ENTRIES ###############################
close($log) or die "Cannot close $fileLog: $!";
\ No newline at end of file
# Script taking the following resources as input:
#   - in the $dirRessource directory, every file whose name matches
#     $nameLexiconFiles (other patterns used before, e.g. for Spanish:
#     MM.(adj|adv|int|nom|vaux|verb))
#   - the tagset file ($fileTagset) describing the categories and features
# Output, both written into $dirRessource:
#   - $fileLexiconOut: the lexicon, one "word\troot\tfeat1=value,feat2=value..." line per entry
#   - $fileSpecOut: the language / description header plus the DESCR lines
# Diagnostics are written to loglst.txt in the current directory.
use Encode;
use IO::Handle;
STDOUT->autoflush();
use strict;
use warnings;
use utf8;
use Data::Dumper;
################################## USER LEXICON VARIABLES ################################
my $userLanguage="french";
my $userDescription="ce lexique recouvre les mots ... et les traits";
my $userRelationType="dérivation, racine, etc ... définition par l'user";
################################## END USER LEXICON VARIABLES ############################
################################## DATA STRUCTURES #######################################
####### RESOURCE PATHS
my $dirRessource="entries-en";
my $fileTagset="tagset.dat";
my $useDirectTranslation=1;# 1 if DecompositionRules aren't used, for example for english data of freeling
my %directTranslation;# key = VBZ / value = 'pos=verb,vform=personal,person=3' ("|" already replaced by ",")
my $fileLexiconOut="lexicon.tsv";
my $fileSpecOut="spec.txt";
my $fileLog="loglst.txt";
#my $nameLexiconFiles="lefff\.(adj|adv|int|nom|vaux|verb)";
# Same pattern string as before ("[^\.]+"), written without the needless escape.
my $nameLexiconFiles='[^.]+';
my %cat2features;
# $cat2features{$catLabel}{$featureLabel}{$value}=$value
# e.g. $cat2features{"N"}{"type"}{"C"}="C";
my %labels2expand;
# $labels2expand{$featureLabel}{"$value=$expansion"}=1
# e.g. $labels2expand{"type"}{"C=common"}=1; categories are stored under the "catégorie" key
my %cat2CharCoded;# number of characters on which the category is coded
# $cat2CharCoded{"N"}="2";
my %cat2featuresOrder;
# $cat2featuresOrder{$catLabel}{$position}=$featureLabel (positions start at 0)
# e.g. $cat2featuresOrder{"N"}{0}="type";
my %wordsLexicon;# set of output lines: $wordsLexicon{"word\troot\tfeatures"}=1
my %unknownTags;# tags without a direct translation that were already logged
################################## END DATA STRUCTURES ###################################
open(my $log, ">:encoding(utf8)", $fileLog)
    or die "Cannot open $fileLog for writing: $!";

################################## READ TAGSET ###########################################
my $pathTagset = $dirRessource."/".$fileTagset;
open(my $tagset, "<:encoding(utf8)", $pathTagset)
    or die "Cannot open $pathTagset for reading: $!";
my $inRules=0;
my $inDirectTranslation=0;
my $stringDirectTranslation="";
while (my $line = <$tagset>) {
    # Closing tags are tested before the content handling and opening tags
    # after it, so the tag lines themselves are never treated as content.
    $inRules = 0 if $line =~ /^<\/DecompositionRules>/;
    $inDirectTranslation = 0 if $line =~ /^<\/DirectTranslations>/;

    # Direct translations are only accumulated here; they are parsed after the loop.
    $stringDirectTranslation .= $line if $inDirectTranslation;

    if ($inRules) {
        # Rule line: category label, number of characters, expanded name, then the features.
        if ($line =~ /^([A-Z]+)\s(\d+)\s(\S+)\s*(.*)/) {
            my ($catLabel, $charCat, $catExpand, $features) = ($1, $2, $3, $4);
            $labels2expand{"catégorie"}{$catLabel."=".$catExpand}=1;
            $cat2CharCoded{$catLabel}=$charCat;
            print {$log} "$catLabel = $catExpand codé sur $charCat et traits : $features \n";

            # Position of the feature in the rule; incremented even for malformed features.
            my $cpt=0;
            foreach my $featureCase (split(" ", $features)) {
                print {$log} "FEATURE = $featureCase \n";
                # Feature form: label/value:expansion;value:expansion;...
                if ($featureCase =~ /(.+)\/(.+)/) {
                    my ($featureLabel, $featureValuesString) = ($1, $2);
                    $cat2featuresOrder{$catLabel}{$cpt}=$featureLabel;
                    foreach my $featureValue (split(";", $featureValuesString)) {
                        my ($value, $expansion) = split(":", $featureValue);
                        # A missing part is treated as the empty string, which is
                        # what the concatenations below produced before.
                        $value //= "";
                        $expansion //= "";
                        $labels2expand{$featureLabel}{$value."=".$expansion}=1;
                        print {$log} "$featureLabel = ".$value." expand by ".$expansion." \n";
                        $cat2features{$catLabel}{$featureLabel}{$value}=$value;
                    }
                }
                else {
                    print {$log} "BLEME RULES feature $featureCase \n";
                }
                $cpt++;
            }
        }
        else {
            print {$log} "BLEME RULES EX $line \n";
        }
    }

    $inRules = 1 if $line =~ /^<DecompositionRules>/;
    $inDirectTranslation = 1 if $line =~ /^<DirectTranslations>/;
}
close($tagset);

if ($useDirectTranslation) {
    foreach my $lineDirectTranslation (split("\n", $stringDirectTranslation)) {
        # Three whitespace-separated parts; the second is used as key and the third as features.
        if ($lineDirectTranslation =~ /^(.+)\s(.+)\s(.+)\s*$/) {
            my ($label, $features) = ($2, $3);
            $features =~ s/\|/,/g;
            $directTranslation{$label}=$features;
        }
    }
}
################################## END READ TAGSET #######################################
################################## READ LEXICON ENTRIES ##################################
opendir(my $lexicons, $dirRessource)
    or die "Cannot open directory $dirRessource: $!";
while (my $fileLexicon = readdir($lexicons)) {
    next unless $fileLexicon =~ /^$nameLexiconFiles$/;

    my $pathLexicon = $dirRessource."/".$fileLexicon;
    my $lexicon;
    # One unreadable entry file is reported and skipped so the others are still processed.
    unless (open($lexicon, "<:encoding(utf8)", $pathLexicon)) {
        print {$log} "BLEME OUVERTURE $pathLexicon : $! \n";
        next;
    }
    print {$log} "LECTURE $fileLexicon \n";
    while (my $line = <$lexicon>) {
        # Entry line: word, root and tag, each separated by a single whitespace character.
        if ($line =~ /^([^\s]+)\s([^\s]+)\s([^\s]+)/) {
            my ($word, $root, $features) = ($1, $2, $3);
            my $featuresString;
            if ($useDirectTranslation) {
                # The whole tag is looked up in the direct translation table.
                # An unknown tag still yields an entry with an empty feature
                # column (as before), but is now reported once in the log.
                if (!exists $directTranslation{$features} && !exists $unknownTags{$features}) {
                    $unknownTags{$features}=1;
                    print {$log} "BLEME TRANSLATION $features \n";
                }
                $featuresString = $directTranslation{$features} // "";
            }
            else {
                # The first letter of the tag is the category; every following
                # letter is the value of the feature found at the same position
                # in $cat2featuresOrder{$cat}.
                my @featuresLetter=split("", $features);
                my $cat = $featuresLetter[0];
                $featuresString ="cat=".$cat.",";
                for my $i (0 .. $#featuresLetter-1) {
                    my $featureLabel = $cat2featuresOrder{$cat}{$i} // "";
                    $featuresString .= $featureLabel."=".$featuresLetter[$i+1].",";
                }
                chop($featuresString);# drop the trailing comma
            }
            $wordsLexicon{$word."\t".$root."\t".$featuresString}=1;
        }
        else {
            print {$log} "BLEME LECTURE LINE $line \n";
        }
    }
    close($lexicon);
}
closedir($lexicons);
################################## END READ LEXICON ENTRIES ##############################
################################## WRITE TSV LEXICON #####################################
my $pathLexiconOut = $dirRessource."/".$fileLexiconOut;
open(my $lexiconTsv, ">:encoding(utf8)", $pathLexiconOut)
    or die "Cannot open $pathLexiconOut for writing: $!";
foreach my $entry (sort { $a cmp $b } keys(%wordsLexicon)) {
    print {$lexiconTsv} $entry."\n";
}
close($lexiconTsv) or die "Cannot close $pathLexiconOut: $!";
################################## END WRITE TSV LEXICON #################################
################################## WRITE SPEC TXT ########################################
# Line form: DESCR:value(<feature label>):<value>=<expansion>
my $pathSpecOut = $dirRessource."/".$fileSpecOut;
open(my $specTxt, ">:encoding(utf8)", $pathSpecOut)
    or die "Cannot open $pathSpecOut for writing: $!";
print {$specTxt} "LANGUAGE=".$userLanguage."\n";
print {$specTxt} "RELATIONTYPE=".$userRelationType."\n";
print {$specTxt} "DESCRIPTION=".$userDescription."\n";
print {$specTxt} "DESCR:label:cat=catégorie\n";
foreach my $keyExpand (sort { $a cmp $b } keys(%labels2expand)) {
    foreach my $keyValueExpand (sort { $a cmp $b } keys(%{$labels2expand{$keyExpand}})) {
        print {$specTxt} "DESCR:value(".$keyExpand."):".$keyValueExpand."\n";
    }
}
close($specTxt) or die "Cannot close $pathSpecOut: $!";
################################## END WRITE SPEC TXT ####################################
close($log) or die "Cannot close $fileLog: $!";
\ No newline at end of file
# Script taking the following resource as input:
#   $fileLexiconIn (an XML file)
# Output -> $fileTagsOut: the sorted list of the distinct tag names found at
# the start of a line of the input, one per line.
use Encode;
use IO::Handle;
STDOUT->autoflush();
use strict;
use warnings;
use utf8;
use Data::Dumper;
################################## DATA STRUCTURES #######################################
####### RESOURCE PATHS
my $fileLexiconIn="lexiques/commonNoun_Morphalou3.1_LMF.xml";
my $fileTagsOut="tagsMorphalouN.txt";
my $fileLog="loglst.txt";
my %tags;# set of tag names seen: $tags{$name}=1
################################## END DATA STRUCTURES ###################################
# Nothing is written to the log; it is still created (and truncated) as it was before.
open(my $log, ">:encoding(utf8)", $fileLog)
    or die "Cannot open $fileLog for writing: $!";
################################## READ LEXICON ENTRIES ##################################
# A missing input used to produce an empty tag list without any diagnostic.
open(my $lexicon, "<:encoding(utf8)", $fileLexiconIn)
    or die "Cannot open $fileLexiconIn for reading: $!";
while (my $line = <$lexicon>) {
    # Only the first tag of a line is considered, and only when it is preceded
    # by nothing but whitespace. The name is everything after "<" up to the
    # first whitespace or angle bracket, so a closing tag is recorded with its
    # leading "/".
    if ($line =~ /^\s*<([^\s<>]+)[^><]*>/) {
        $tags{$1}=1;
    }
}
close($lexicon);
################################## END READ LEXICON ENTRIES ##############################
################################## WRITE TAG LIST ########################################
open(my $tagsOut, ">:encoding(utf8)", $fileTagsOut)
    or die "Cannot open $fileTagsOut for writing: $!";
foreach my $tagstring (sort { $a cmp $b } keys(%tags)) {
    print {$tagsOut} $tagstring."\n";
}
close($tagsOut) or die "Cannot close $fileTagsOut: $!";
################################## END WRITE TAG LIST ####################################
close($log);
\ No newline at end of file
# Script taking the following resource as input:
#   $fileLexiconIn
# It writes the lines of that file, sorted, into $fileLexiconOut.
use Encode;
use IO::Handle;
STDOUT->autoflush();
use strict;
use warnings;
use utf8;
use Data::Dumper;
use locale;
################################## RESOURCE PATHS ########################################
my $fileLexiconIn = "lexicon.tsv";
# my $fileLexiconIn="extrait.xml";
my $fileLexiconOut = "lexicon-sort.tsv";

################################## READ INPUT LEXICON ####################################
# Fail loudly when the input is missing instead of silently writing an empty output file.
open(my $lexiconIn, "<:encoding(utf8)", $fileLexiconIn)
    or die "Cannot open $fileLexiconIn for reading: $!";
my @lines = <$lexiconIn>;
close($lexiconIn);

################################## WRITE TSV LEXICON #####################################
open(my $lexiconOut, ">:encoding(utf8)", $fileLexiconOut)
    or die "Cannot open $fileLexiconOut for writing: $!";

# "use locale" is in effect, so the default sort comparison follows the
# collation rules of the current locale (see perllocale).
print {$lexiconOut} join("", sort @lines);

# Buffered write errors only surface at close time, so check it.
close($lexiconOut) or die "Cannot close $fileLexiconOut: $!";
################################## END WRITE TSV LEXICON #################################
# Script building the lexicon $fileLexicon from the CSV file $fileNBA.
# The first line of the CSV is skipped. For every other row whose player name
# contains a space, two lexicon lines are written, one keyed by the first
# name and one by the surname:
#   word\tteam-year\tage=<age>,post=<post>
use Encode;
use IO::Handle;
STDOUT->autoflush();
use strict;
use warnings;
use utf8;
use Data::Dumper;
use Text::Levenshtein qw(distance);
use experimental 'smartmatch';
use locale;
my $fileNBA = "Seasons_Stats.csv";
my $fileLexicon = "lexicon.tsv";

# Open the input first: a missing CSV must not leave an empty lexicon behind.
open(my $nba, "<:encoding(utf8)", $fileNBA)
    or die "Cannot open $fileNBA for reading: $!";
open(my $lexicon, ">:encoding(utf8)", $fileLexicon)
    or die "Cannot open $fileLexicon for writing: $!";

# The first line is read and discarded (it was skipped through a counter before).
my $header = <$nba>;

while (my $lineNBA = <$nba>) {
    # Columns 2 to 6 are captured (the first column is ignored); rows with
    # fewer columns or an empty name column are ignored silently.
    next unless $lineNBA =~ /^[^,]*,([^,]*),([^,]+),([^,]*),([^,]*),([^,]*),/;
    my ($year, $names, $post, $age, $team) = ($1, $2, $3, $4, $5);

    # The greedy first group keeps everything before the last space as the first name.
    if ($names =~ /^(.+) (.+)$/) {
        my ($firstname, $surname) = ($1, $2);
        # Features are comma-separated (feat1=value,feat2=value); the comma
        # between age and post was missing, which glued both features together.
        my $rootAndFeatures = $team."-".$year."\tage=".$age.",post=".$post."\n";
        print {$lexicon} $firstname."\t".$rootAndFeatures;
        print {$lexicon} $surname."\t".$rootAndFeatures;
    }
    else {
        print "Name ? $names dans $lineNBA \n";
    }
}
close($nba);
# Buffered write errors only surface at close time, so check it.
close($lexicon) or die "Cannot close $fileLexicon: $!";
Supports Markdown
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment