Coverage for src/ptf_tools/doi.py: 21%
253 statements
« prev ^ index » next coverage.py v7.7.0, created at 2025-04-03 12:11 +0000
« prev ^ index » next coverage.py v7.7.0, created at 2025-04-03 12:11 +0000
1import os
2from datetime import datetime
4import requests
5from django.conf import settings
6from django.core.exceptions import ObjectDoesNotExist
7from django.template.loader import render_to_string
8from lxml import etree
9from ptf.cmds.xml import xml_utils
10from ptf.display.resolver import find_id_type
12from mersenne_tools.models import DOIBatch
def get_doibatch(resource):
    """
    Return the DOIBatch attached to a resource, if any.

    @param resource: object whose ``doibatch`` attribute is read
    @return: ``resource.doibatch``, or None when reading it raises ObjectDoesNotExist
    """
    try:
        return resource.doibatch
    except ObjectDoesNotExist:
        return None
def get_or_create_doibatch(resource):
    """
    Return the DOIBatch of a resource after checking its DOI.

    The DOI is resolved with one GET (redirects not followed) on
    settings.DOI_BASE_URL + resource.doi. When the answer is a 302, the
    redirect target is compared with resource.get_url_absolute() and the
    batch is created or updated with the outcome. For any other answer, the
    existing batch (if any) is refreshed with checkDOIBatch.

    @param resource: resource (models.Resource) whose DOI is checked
    @return: new or updated doibatch, or None when no batch exists and the
             DOI does not redirect (or when a book-part placeholder batch
             was deleted)
    """
    # check DOI
    url = settings.DOI_BASE_URL + resource.doi
    response = requests.get(url, allow_redirects=False)

    if response.status_code == 302:
        # .get(): a redirect without a Location header is reported as a wrong
        # URL instead of raising KeyError
        if resource.get_url_absolute() == response.headers.get("Location"):
            status = "Enregistré"
            log = "Vérifié sur CROSSREF"
        else:
            status = "Erreur"
            log = "Mauvaise URL pour le DOI !!!/à réenregistrer"

        doibatch = get_doibatch(resource)
        if doibatch:
            doibatch.status = status
            doibatch.log = log
        else:
            doibatch = DOIBatch(resource=resource, status=status, log=log)
        doibatch.save()
        return doibatch

    doibatch = get_doibatch(resource)

    # Book-part case: registration is done at container level, so the batch
    # linked to the book-part cannot be queried; it was only created to display
    # "En cours" at book-part level.
    if (
        doibatch
        and resource.classname == "Article"
        and resource.my_container.ctype.startswith("book")
    ):
        doibatch.delete()
        doibatch = None

    if doibatch:
        doibatch = checkDOIBatch(doibatch)

    return doibatch
# recordDOI, per resource (article)
# Known issue when registering DOIs with CROSSREF:
# - to register a DOI, the journal DOI is used as the reference: CROSSREF treats this as a
#   request to register/update the journal DOI itself...
# When several requests are sent one after the other (Record all DOIs), they are processed in an
# order that differs (randomly) from the sending order, which yields errors such as:
#   "Record not processed because submitted version: 201810150907372216 is less or equal to
#    previously submitted version {1}"
# (BUT the record involved here is the journal's one; the article's record is generally fine)
# because every request carries a timestamp.
#
# To work around these errors (previously only failure_count was inspected, and so there was
# one), the returned XML has to be interpreted:
#   <record_diagnostic status="Success">
#     <doi>10.5802/alco.21</doi>
#     <msg>Successfully updated</msg>
# This is what is retained (in checkDOIBatch).
#


def _append_record_doi_log(message):
    """Append ``message`` as one line to record_doi.log in settings.LOG_DIR."""
    with open(os.path.join(settings.LOG_DIR, "record_doi.log"), "a", encoding="utf-8") as file_:
        file_.write(message + "\n")


def _last_year_of_range(year):
    """Return ``year`` if it parses with "%Y", else the 2nd item of a "2010-2011"-like range."""
    try:
        datetime.strptime(year, "%Y")
    except ValueError:
        # the value is assumed to look like 2010-2011: keep the 2nd year of the range
        return year.split("-")[1]
    return year


def recordDOI(resource, testing=False):
    """
    Build the CROSSREF registration XML of a resource and, unless testing, post it.

    A new DOIBatch with status "En cours" replaces any previous batch of the
    resource; the rendered XML is stored on it.

    @param resource: Article, Container or TranslatedArticle (told apart by ``classname``)
    @param testing: Boolean set to True when testing (the XML is rendered and saved,
                    but nothing is sent to CROSSREF)
    @return: data {"status": 404 by default, the HTTP status (200) when CROSSREF answers
             "SUCCESS"; "message": start of the response body on an unexpected HTTP status}.
             A Container without DOI returns only {"message": ...}.
    """
    full_date_format = "<month>%m</month><day>%d</day><year>%Y</year>"

    doibatch = get_doibatch(resource)
    if doibatch:
        doibatch.delete()

    doibatch = DOIBatch(resource=resource, status="En cours")
    doibatch.save()

    # https://data.crossref.org/reports/help/schema_doc/4.4.2/schema_4_4_2.html#timestamp
    timestamp = datetime.now().strftime("%Y%m%d%H%M%S%f")  # len = 20, must be 19
    context = {
        "doi_batch_id": f"{doibatch.pk:04d}",
        "timestamp": timestamp[0:19],
        "mail": settings.CROSSREF_MAIL,
    }
    template = f"crossref/{resource.classname.lower()}_doi_register.xml"

    # hack to work out the publication date of a resource
    if resource.classname == "Article":
        # an article without contributors is registered as a posted-content of type other
        # https://data.crossref.org/reports/help/schema_doc/4.4.2/schema_4_4_2.html#posted_content
        if not resource.get_author_contributions():
            template = "crossref/posted-content.xml"

        if resource.date_published:
            # date in the format expected by CROSSREF
            resource.DOIdate = resource.date_published.strftime(full_date_format)

            # the container year is checked as well
            year = _last_year_of_range(resource.my_container.year)
            if year != resource.my_container.year:
                resource.my_container.year = year
        elif resource.date_online_first:
            # Online First
            # TODO: Is it possible to send 2 dates to Crossref ?
            # You can send multiple <publication_date> but it is for multiple media_type
            # (print vs online)
            # The container year is '0' here, so it is not checked.
            resource.DOIdate = resource.date_online_first.strftime(full_date_format)
        else:
            # no date on the article: extrapolate from the volume year
            year = _last_year_of_range(resource.my_container.year)
            resource.DOIdate = "<year>%s</year>" % year
            if year != resource.my_container.year:
                resource.my_container.year = year

    elif resource.classname == "Container":
        if not resource.doi:
            return {"message": "Erreur, le numéro n'a pas de doi."}

        if resource.ctype.startswith("book"):
            # PS: book chapters are not handled here, everything is done in the template
            # when the book is registered
            # the template depends on the ctype
            if resource.my_collection.issn or resource.my_collection.e_issn:
                template = "crossref/book_series_metadata.xml"
            else:
                template = "crossref/book_set_metadata.xml"
            # a standalone book that does not belong to a series would need book_metadata
            context["book_type"] = resource.ctype[5:].replace("-", "_")

            # Placeholder batches so that every book-part displays "En cours".
            # A dedicated name is used so that ``doibatch`` keeps pointing at the
            # container batch: its pk is the doi_batch_id sent to CROSSREF, and it is
            # the one that must receive the XML and any error status below.
            for bookpart in resource.article_set.all():
                bookpart_batch = get_doibatch(bookpart)
                if bookpart_batch:
                    bookpart_batch.delete()
                bookpart_batch = DOIBatch(resource=bookpart, status="En cours")
                bookpart_batch.save()

        elif resource.ctype.startswith("issue_special"):
            template = "crossref/issue_doi_register.xml"
            context["title"] = resource.title_html

        # NOTE(review): applied to every Container ctype - confirm against the original file
        resource.DOIdate = "<year>%s</year>" % _last_year_of_range(resource.year)

    elif resource.classname == "TranslatedArticle":
        _append_record_doi_log(resource.doi)

        resource.DOIdate = resource.date_published.strftime(full_date_format)
        context["collection"] = resource.original_article.get_top_collection()

    context["resource"] = resource

    preprint_id = preprint_type = None
    extid = resource.extid_set.filter(id_type="preprint").first()
    if extid is not None:
        preprint_id = extid.id_value
        preprint_type = find_id_type(preprint_id)
        # crossref allows "doi" and "arxiv", but not "hal"
        if preprint_type == "hal":
            preprint_type = "other"
    context["preprint_id"] = preprint_id
    context["preprint_type"] = preprint_type

    abstract = resource.abstract_set.filter(lang=resource.lang, tag="abstract").first()
    if abstract:
        context["abstract"] = xml_utils.get_jats_from_xml_with_formula(
            abstract.value_xml, with_mathml=True
        )

    rdoi_extid = resource.extid_set.filter(id_type="rdoi").first()
    context["rdoi"] = rdoi_extid.id_value if rdoi_extid is not None else None

    try:
        xml = render_to_string(template_name=template, context=context)
        doibatch.xml = xml
        doibatch.save()
    except Exception as e:
        if resource.classname == "TranslatedArticle":
            _append_record_doi_log(str(e))
        raise

    files = {"file": (f"{doibatch.pk}.xml", xml)}

    data = {"status": 404}
    if not testing:
        if resource.classname == "TranslatedArticle":
            crossref_user, crossref_pwd = get_user_pwd_crossref(resource.original_article)
            _append_record_doi_log("Call crossref")
        elif resource.classname == "Container" and resource.ctype.startswith("book"):
            # no DOI at container level: the first book-part gives the crossref credentials
            crossref_user, crossref_pwd = get_user_pwd_crossref(resource.article_set.first())
        else:
            crossref_user, crossref_pwd = get_user_pwd_crossref(resource)

        crossref_batch_url = settings.CROSSREF_BATCHURL_TPL % (crossref_user, crossref_pwd)

        response = requests.post(crossref_batch_url, files=files)
        body = response.text.encode("utf8")
        if response.status_code == 200:
            # an answer without <title> is treated as "not SUCCESS" instead of IndexError
            titles = etree.XML(body).xpath("//*/title")
            if titles and titles[0].text == "SUCCESS":
                data["status"] = response.status_code
        elif response.status_code == 401:
            doibatch.status = "Erreur"
            doibatch.log = "Pb d'authentification"
            doibatch.save()
        else:
            doibatch.status = "Erreur"
            doibatch.save()
            data["message"] = body[:1000].decode("utf-8")

    if resource.classname == "TranslatedArticle":
        _append_record_doi_log(doibatch.status)
    return data
def get_user_pwd_crossref(resource):
    """
    Get the CROSSREF credentials matching the DOI prefix of a resource.

    The part of the DOI located between the first "." and the first "/"
    (e.g. "5802" in "10.5802/alco.21") selects the settings
    CROSSREF_USER_<code> and CROSSREF_PWD_<code>. When either of them is
    missing, the 5802 credentials are returned.

    @param resource: object exposing a ``doi`` string
    @return: (crossref_user, crossref_pwd)
    """
    member_code = resource.doi.partition("/")[0].split(".")[1]
    try:
        return (
            getattr(settings, f"CROSSREF_USER_{member_code}"),
            getattr(settings, f"CROSSREF_PWD_{member_code}"),
        )
    except AttributeError:
        return settings.CROSSREF_USER_5802, settings.CROSSREF_PWD_5802
def checkDOIBatch(doibatch):
    """
    Check the status of a DOI batch with an HTTP request and store the outcome.

    @param doibatch: DOIBatch
    @return: the same DOIBatch, saved with status and log updated
    """
    user, password = get_user_pwd_crossref(doibatch.resource)
    check_url = settings.CROSSREF_BASE_CHECKBATCH_URL_TPL % (user, password)
    check_url = check_url.format(doibatch.pk)
    response = requests.get(check_url)

    if response.status_code != 200:
        doibatch.status = "Erreur"
        doibatch.log = response.text
        doibatch.save()
        return doibatch

    # parse the returned XML
    tree = etree.XML(response.text.encode("utf8"))
    batch_status = tree.xpath("/doi_batch_diagnostic")[0].attrib["status"]

    if batch_status == "completed":
        # the batch has been processed
        doibatch.status = "batch terminé"
        doibatch.log = "Pas de DOI associé dans le batch : voir le xml"
        for record in tree.xpath("//*/record_diagnostic"):
            record_doi = record.xpath("doi")[0].text
            record_msg = record.xpath("msg")[0].text
            record_status = record.attrib["status"]
            registered = record_doi == doibatch.resource.doi and record_status == "Success"
            doibatch.status = "Enregistré" if registered else "Erreur"
            # NOTE(review): the log is assumed to be set for every record - confirm
            doibatch.log = record_msg
    elif batch_status in ("in_process", "queued"):
        doibatch.status = "En cours"
        doibatch.log = "batch en cours de traitement"
    else:
        # refreshed too early after Record DOI
        doibatch.status = "Erreur"
        doibatch.log = "Attention, il se peut qu'il faille rafraichir un peu plus tard {} ".format(
            response.text
        )

    doibatch.save()
    return doibatch
def removeOldDataInCrossref(article, testing=False):
    """
    The CRAS 2002-2019 articles were registered by Elsevier
    To remove some metadata in Crossref, we need to provide a separate XML with the fields to remove

    A new DOIBatch with status "En cours" replaces any previous batch of the
    article; the rendered XML is stored on it.

    @param article: article whose old Crossref metadata must be removed
    @param testing: Boolean set to True when testing (the XML is printed and saved,
                    but nothing is sent to Crossref)
    @return: data {"status": 404 by default, the HTTP status (200) when Crossref answers
             "SUCCESS"; "message": start of the response body on an unexpected HTTP status}
    """
    doibatch = get_doibatch(article)
    if doibatch:
        doibatch.delete()

    doibatch = DOIBatch(resource=article, status="En cours")
    doibatch.save()

    timestamp = datetime.now().strftime("%Y%m%d%H%M%S%f")  # len = 20, must be 19
    context = {
        "resource": article,
        "doi_batch_id": f"{doibatch.pk:04d}",
        "timestamp": timestamp[0:19],
        "mail": settings.CROSSREF_MAIL,
    }
    template = "crossref/article_remove_old_data.xml"

    if article.date_published:
        article.DOIdate = article.date_published.strftime(
            "<month>%m</month><day>%d</day><year>%Y</year>"
        )

    # rendering or saving errors simply propagate to the caller
    xml = render_to_string(template_name=template, context=context)

    if testing:
        print(xml)

    doibatch.xml = xml
    doibatch.save()

    files = {"file": (f"{doibatch.pk}.xml", xml)}

    data = {"status": 404}
    if not testing:
        crossref_user, crossref_pwd = get_user_pwd_crossref(article)
        crossref_batch_url = settings.CROSSREF_BATCHURL_TPL % (crossref_user, crossref_pwd)

        response = requests.post(crossref_batch_url, files=files)
        body = response.text.encode("utf8")
        if response.status_code == 200:
            # an answer without <title> is treated as "not SUCCESS" instead of IndexError
            titles = etree.XML(body).xpath("//*/title")
            if titles and titles[0].text == "SUCCESS":
                data["status"] = response.status_code
        elif response.status_code == 401:
            doibatch.status = "Erreur"
            doibatch.log = "Pb d'authentification"
            doibatch.save()
        else:
            doibatch.status = "Erreur"
            doibatch.save()
            data["message"] = body[:1000].decode("utf-8")

    return data