Commit b99bdc25 authored by Sylvain Coulange
Browse files

tests analyse morpho fr

parent 187964fd
......@@ -2,7 +2,7 @@ print("Import des librairies et chargement du modèle de langue...")
from bs4 import BeautifulSoup as bs
import re, spacy
# nlp = spacy.load("fr_core_news_md")
# nlp = spacy.load("fr_core_news_sm")
print("Chargement des fichiers...")
......@@ -33,43 +33,53 @@ xrPliste = xr.find_all('P')
for x,p in enumerate(xrPliste):
# Build the initial and the final version of the current paragraph.
# Every resolved "correction" <edit/> tag is replaced by an anchored span
#     °{idodf}text*
# where "°" opens the span, "{idodf}" carries the edit identifier, and "*"
# closes it. xrPini keeps the `initial` attribute as the span text, xrPfin
# keeps the `final` attribute.
# NOTE(review): the patterns depend on the exact attribute order produced when
# the tag is serialised with str() - confirm against the input files.
xrPini = re.sub(r'<edit auteur="[^"]*" categorie="correction" comment="[^"]*" date="[^"]*" final="[^"]*" idodf="([^"]*)" initial="([^"]*)" resolu="1" tags="" type="[^"]*"/>', r'°{\1}\2*', str(xrPliste[x]))
xrPfin = re.sub(r'<edit auteur="[^"]*" categorie="correction" comment="[^"]*" date="[^"]*" final="([^"]*)" idodf="([^"]*)" initial="[^"]*" resolu="1" tags="" type="[^"]*"/>', r'°{\2}\1*', str(xrPliste[x]))
# Drop the enclosing <P id="..."> / </P> wrapper so only the text remains.
xrPini = re.sub(r'<P id="\w+"/?>','', xrPini)
xrPini = re.sub(r'</P>','', xrPini)
xrPfin = re.sub(r'<P id="\w+"/?>','', xrPfin)
xrPfin = re.sub(r'</P>','', xrPfin)
# nlpIni = nlp(xrPini)
# nlpFin = nlp(xrPfin)
def getListEditPos(xrp):
    """Locate the anchored edits in *xrp* and return the text without anchors.

    An edit is written as ``°{idodf}text*``: ``°`` opens the edit, the
    optional ``{idodf}`` block carries its identifier, and ``*`` closes it.

    Positions are expressed as a count of non-space characters of the
    cleaned text (anchors and identifiers are never counted), so they stay
    valid whatever the spacing of the text is.

    Args:
        xrp: paragraph text containing anchored edits.

    Returns:
        A ``(listEditPos, cleaned)`` tuple. ``listEditPos`` is a list of
        dicts with the keys ``"idodf"`` (identifier, or ``None`` when the
        opening anchor has no ``{...}`` block), ``"debut"`` (number of
        non-space characters before the edit) and ``"fin"`` (number of
        non-space characters before its closing anchor, or ``None`` if the
        edit is never closed). ``cleaned`` is *xrp* with every anchor and
        identifier removed.
    """
    listEditPos = []
    cleaned = []
    cptCar = 0  # non-space characters emitted so far
    i = 0
    while i < len(xrp):
        car = xrp[i]
        if car == "°":
            idodf = None
            skip = 1  # the anchor itself
            # Read the identifier up to its closing brace instead of
            # assuming a fixed width.
            if xrp.startswith("{", i + 1):
                close = xrp.find("}", i + 2)
                if close != -1:
                    idodf = xrp[i + 2:close]
                    skip = close - i + 1
            listEditPos.append({
                "idodf": idodf,
                "debut": cptCar,
                "fin": None
            })
            i += skip
            continue
        # "*" only closes an edit that is currently open; anywhere else it
        # is ordinary text and must neither crash nor overwrite a previous
        # edit's end position.
        if car == "*" and listEditPos and listEditPos[-1]["fin"] is None:
            listEditPos[-1]["fin"] = cptCar
            i += 1
            continue
        cleaned.append(car)
        if car != " ":
            cptCar += 1
        i += 1
    return listEditPos, "".join(cleaned)
# Debug output for paragraph number x: first the initial version, still
# carrying its edit anchors.
print('\nP',x)
print('\t', xrPini)
# NOTE(review): the next two lines look like the before/after sides of the
# commit diff. getListEditPos returns two values (edit list, cleaned text),
# which matches the tuple-unpacking form - confirm that only that one is kept.
listEditPos = getListEditPos(xrPini)
listEditPos, xrp = getListEditPos(xrPini)
# Show the text returned by getListEditPos, then each edit record.
print('\t', xrp)
for e in listEditPos:
print(e)
# Start building a line of spaces running up to each edit's 'debut' value,
# presumably to print markers under the text - TODO confirm intent.
s = ""
sf = 0
for ed in listEditPos:
for i in range(sf,ed['debut']):
s += ' '
# NOTE(review): this assignment has no right-hand side in this view; the
# intended value is unknown - TODO complete.
sf =
print('\t', xrPfin)
# cpt = 0
# for token in xrPini:
# # on incrémente à chaque caractère sauf ' '
# for car in token.text:
# if cpt >= listEditPos[0]['debut'] and cpt <= listEditPos[0]['fin']:
# cpt += 1 if car != ' '
# print('\t', xrPfin)
"""
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment