Coverage for src/ptf_tools/doi.py: 18%

305 statements  

« prev     ^ index     » next       coverage.py v7.8.2, created at 2025-10-31 09:10 +0000

1import os 

2from datetime import datetime 

3 

4import requests 

5from django.conf import settings 

6from django.core.exceptions import ObjectDoesNotExist 

7from django.template.loader import render_to_string 

8from lxml import etree 

9from ptf.cmds.xml import xml_utils 

10from ptf.display.resolver import find_id_type 

11from ptf.templatetags.helpers import search_license 

12 

13from mersenne_tools.models import DOIBatch 

14 

15 

def get_doibatch(resource):
    """
    Fetch the DOIBatch attached to a resource.

    @param resource: object whose ``doibatch`` attribute is read
    @return: ``resource.doibatch``, or None when reading it raises ObjectDoesNotExist
    """
    try:
        return resource.doibatch
    except ObjectDoesNotExist:
        # no batch has been recorded for this resource
        return None

24 

25 

def checkDOIExistence(resource):
    """
    Check that the DOI of a resource redirects to the expected landing page.

    The DOI URL (``settings.DOI_BASE_URL`` + ``resource.doi``) is requested without
    following redirects and the ``Location`` header of the answer is compared with
    the URL expected for the resource.

    @param resource: models.Resource (``doi``, ``classname``, ``pid``, ``get_url_absolute()`` are used)
    @return: False if the answer is a 302 whose Location differs from the expected URL,
             True otherwise
    """
    # NOTE(review): a non-302 answer (e.g. 404 for an unknown DOI) still returns True,
    # as before - confirm that this is what callers expect.

    # check DOI
    url = settings.DOI_BASE_URL + resource.doi
    r = requests.get(url, allow_redirects=False)
    # An answer that is not a redirect has no Location header: read it defensively
    # instead of raising KeyError.
    location = r.headers.get("Location", "")
    resource_url = resource.get_url_absolute()

    if resource.classname == "Collection":
        # get_url_absolute returns <site_url>/item/<COLID> but doi.org redirects to <site_url>
        resource_url = resource.extlink_set.get(rel="website", metadata="website").location
    elif resource_url != location and "/item/" in location:
        # Old DOIs were recorded with /item/<PID>
        website = (
            resource.get_collection().extlink_set.get(rel="website", metadata="website").location
        )
        resource_url = f"{website}/item/{resource.pid}"

    # Only a redirect to the wrong URL is reported as a failure
    return not (r.status_code == 302 and resource_url != location)

57 

58 

# recordDOI works per resource (article).
# Known issue when registering DOIs with CROSSREF:
# - to register a DOI, the journal DOI is sent as a reference: CROSSREF treats this as a request to
#   register/update the journal DOI as well...
# When several requests are sent one after another (Record all DOIs), the processing order is
# different (random) from the sending order and, because every request carries a timestamp,
# we get errors such as:
# "Record not processed because submitted version: 201810150907372216 is less or equal to previously submitted version {1}"
# (BUT the record concerned here is the journal one; the article record is generally fine).
#
# To work around these errors (previously only failure_count was inspected, and so there was one),
# the XML returned by CROSSREF has to be interpreted:
# <record_diagnostic status="Success">
# <doi>10.5802/alco.21</doi>
# <msg>Successfully updated</msg>
# This is what is retained (in checkDOIBatch).
#

75 

76 

def _record_doi_log(message):
    """Append ``message`` as one line to ``record_doi.log`` in ``settings.LOG_DIR``."""
    with open(os.path.join(settings.LOG_DIR, "record_doi.log"), "a", encoding="utf-8") as file_:
        file_.write(message + "\n")


def _record_doi_single_year(year):
    """
    Reduce a year value to a single year.

    @param year: string, e.g. "2010" or a range such as "2010-2011"
    @return: ``year`` unchanged when it parses with "%Y", otherwise the 2nd item of the "-" separated range
    """
    try:
        datetime.strptime(year, "%Y")
    except ValueError:
        # the value is assumed to look like 2010-2011: keep the 2nd year of the range
        return year.split("-")[1]
    return year


def recordDOI(resource, testing=False):
    """
    Build the Crossref registration XML of a resource, store it in a new DOIBatch and,
    unless testing, post it to Crossref.

    Any existing DOIBatch of the resource is deleted and replaced by a new one with
    status "En cours".

    @param resource: Article, Container or TranslatedArticle (selected through ``resource.classname``)
    @param testing: Boolean set to True when testing (nothing is sent to Crossref)
    @return: data {'status': 200 if Crossref answered SUCCESS else 404, 'message': first 1000 bytes
             of the Crossref answer (only when a request was sent)}.
             A Container without DOI returns {'message': ...} only.
    """

    doibatch = get_doibatch(resource)
    if doibatch:
        doibatch.delete()

    doibatch = DOIBatch(resource=resource, status="En cours")
    doibatch.save()

    context = {"doi_batch_id": f"{doibatch.pk:04d}"}
    # https://data.crossref.org/reports/help/schema_doc/4.4.2/schema_4_4_2.html#timestamp
    timestamp = datetime.now().strftime("%Y%m%d%H%M%S%f")  # len = 20, must be 19
    context["timestamp"] = timestamp[0:19]
    context["mail"] = settings.CROSSREF_MAIL
    template = f"crossref/{resource.classname.lower()}_doi_register.xml"
    is_translation = resource.classname == "TranslatedArticle"

    # hack to work out the publication date of a resource
    if resource.classname == "Article":
        # an article without contributors is registered as a posted-content of type other
        # https://data.crossref.org/reports/help/schema_doc/4.4.2/schema_4_4_2.html#posted_content
        if not resource.get_author_contributions():
            template = "crossref/posted-content.xml"

        # this object needs a publication date
        if not resource.date_published and not resource.date_online_first:
            # extrapolate the date from the volume
            year = _record_doi_single_year(resource.my_container.year)
            resource.DOIdate = "<year>%s</year>" % year
            resource.my_container.year = year
        elif resource.date_published:
            # fill in the date with the format expected by CROSSREF
            resource.DOIdate = resource.date_published.strftime(
                "<month>%m</month><day>%d</day><year>%Y</year>"
            )
            # the container year is checked as well
            resource.my_container.year = _record_doi_single_year(resource.my_container.year)
        else:
            # Online First
            # TODO: Is it possible to send 2 dates to Crossref ?
            # You can send multiple <publication_date> but it is for multiple media_type (print vs online)
            resource.DOIdate = resource.date_online_first.strftime(
                "<month>%m</month><day>%d</day><year>%Y</year>"
            )

    elif resource.classname == "Container":
        if not resource.doi:
            return {"message": "Erreur, le numéro n'a pas de doi."}
        if resource.ctype.startswith("book"):
            # NB: book chapters are not handled separately, everything is done in the template
            # when the book is registered. The template depends on the ctype.
            if resource.my_collection.issn or resource.my_collection.e_issn:
                template = "crossref/book_series_metadata.xml"
            else:
                template = "crossref/book_set_metadata.xml"
            context["book_type"] = resource.ctype[5:].replace("-", "_")
            for bookpart in resource.article_set.all():
                # Use a dedicated name: rebinding ``doibatch`` here would make the XML and the
                # final status land on the last book part's batch instead of the container's.
                bookpart_batch = get_doibatch(bookpart)
                if bookpart_batch:
                    bookpart_batch.delete()
                bookpart_batch = DOIBatch(resource=bookpart, status="En cours")
                bookpart_batch.save()

        elif resource.ctype.startswith("issue_special"):
            template = "crossref/issue_doi_register.xml"
            context["title"] = resource.title_html
            resource.DOIdate = "<year>%s</year>" % _record_doi_single_year(resource.year)

    elif is_translation:
        _record_doi_log(resource.doi)

        resource.DOIdate = resource.date_published.strftime(
            "<month>%m</month><day>%d</day><year>%Y</year>"
        )
        context["collection"] = resource.original_article.get_top_collection()

    context["resource"] = resource

    preprint_id = preprint_type = None
    preprint_extid = resource.extid_set.filter(id_type="preprint").first()
    if preprint_extid:
        preprint_id = preprint_extid.id_value
        preprint_type = find_id_type(preprint_id)
        # crossref allows "doi" and "arxiv", but not "hal"
        if preprint_type == "hal":
            preprint_type = "other"
    context["preprint_id"] = preprint_id
    context["preprint_type"] = preprint_type

    abstract = resource.abstract_set.filter(lang=resource.lang, tag="abstract").first()
    if abstract:
        context["abstract"] = xml_utils.get_crossref_jats_from_xml_with_formula(
            abstract.value_xml, with_mathml=True
        )
    # The guard used to test the misspelt attribute "licence" before reading ``license``
    if getattr(resource, "license", None) is not None:
        context["license"] = search_license(resource)

    rdoi_extid = resource.extid_set.filter(id_type="rdoi").first()
    context["rdoi"] = rdoi_extid.id_value if rdoi_extid else None

    try:
        xml = render_to_string(template_name=template, context=context)
        doibatch.xml = xml
        doibatch.save()
    except Exception as e:
        if is_translation:
            _record_doi_log(str(e))
        raise

    files = {"file": (f"{doibatch.pk}.xml", xml)}

    data = {"status": 404}
    if not testing:
        if is_translation:
            crossref_user, crossref_pwd = get_user_pwd_crossref(resource.original_article)
            _record_doi_log("Call crossref")
        elif resource.classname == "Container" and resource.ctype.startswith("book"):
            # the credentials are looked up from the 1st book part
            crossref_user, crossref_pwd = get_user_pwd_crossref(resource.article_set.first())
        else:
            crossref_user, crossref_pwd = get_user_pwd_crossref(resource)

        crossref_batch_url = settings.CROSSREF_BATCHURL_TPL % (crossref_user, crossref_pwd)

        r = requests.post(crossref_batch_url, files=files)
        body = r.text.encode("utf8")
        if r.status_code == 200:
            titles = etree.XML(body).xpath("//*/title")
            if titles and titles[0].text == "SUCCESS":
                data["status"] = r.status_code
        elif r.status_code == 401:
            doibatch.status = "Erreur"
            doibatch.log = "Pb d'authentification"
            doibatch.save()
        else:
            doibatch.status = "Erreur"
            doibatch.save()
        # The slice is made on bytes and may cut a multi-byte character:
        # drop the incomplete tail instead of raising UnicodeDecodeError.
        data["message"] = body[:1000].decode("utf-8", errors="ignore")

    if is_translation:
        _record_doi_log(doibatch.status)
    return data

270 

271 

def recordPendingPublication(resource, testing=False):
    """
    Build the Crossref "pending publication" XML of a resource, store it in a new DOIBatch
    and, unless testing, post it to Crossref.

    Any existing DOIBatch of the resource is deleted and replaced by a new one with
    status "En cours".

    @param resource: the resource to register; for an Article, ``date_accepted`` is read
    @param testing: Boolean set to True when testing (nothing is sent to Crossref)
    @return: data {'status': 200 if Crossref answered SUCCESS else 404, 'message': first 1000 bytes
             of the Crossref answer (only when a request was sent)}
    """

    doibatch = get_doibatch(resource)
    if doibatch:
        doibatch.delete()

    doibatch = DOIBatch(resource=resource, status="En cours")
    doibatch.save()

    context = {"doi_batch_id": f"{doibatch.pk:04d}"}
    # https://data.crossref.org/reports/help/schema_doc/4.4.2/schema_4_4_2.html#timestamp
    timestamp = datetime.now().strftime("%Y%m%d%H%M%S%f")  # len = 20, must be 19
    context["timestamp"] = timestamp[0:19]
    context["mail"] = settings.CROSSREF_MAIL
    template = f"crossref/{resource.classname.lower()}_pending_publication_register.xml"

    if resource.classname == "Article":
        # expose the parts of the acceptance date as separate attributes of the resource
        date = resource.date_accepted
        resource.month_accepted = date.month
        resource.day_accepted = date.day
        resource.year_accepted = date.year

    context["resource"] = resource

    xml = render_to_string(template_name=template, context=context)
    doibatch.xml = xml
    doibatch.save()

    files = {"file": (f"{doibatch.pk}.xml", xml)}

    data = {"status": 404}
    if not testing:
        crossref_user, crossref_pwd = get_user_pwd_crossref(resource)
        crossref_batch_url = settings.CROSSREF_BATCHURL_TPL % (crossref_user, crossref_pwd)

        r = requests.post(crossref_batch_url, files=files)
        body = r.text.encode("utf8")
        if r.status_code == 200:
            titles = etree.XML(body).xpath("//*/title")
            if titles and titles[0].text == "SUCCESS":
                data["status"] = r.status_code
        elif r.status_code == 401:
            doibatch.status = "Erreur"
            doibatch.log = "Pb d'authentification"
            doibatch.save()
        else:
            doibatch.status = "Erreur"
            doibatch.save()
        # The slice is made on bytes and may cut a multi-byte character:
        # drop the incomplete tail instead of raising UnicodeDecodeError.
        data["message"] = body[:1000].decode("utf-8", errors="ignore")

    if resource.classname == "TranslatedArticle":
        with open(
            os.path.join(settings.LOG_DIR, "record_doi.log"), "a", encoding="utf-8"
        ) as file_:
            file_.write(doibatch.status + "\n")
    return data

339 

340 

def get_user_pwd_crossref(resource):
    """
    Look up the CROSSREF credentials matching the DOI prefix of a resource.

    For a DOI such as 10.5802/alco.21 the settings CROSSREF_USER_5802 and
    CROSSREF_PWD_5802 are read. When either setting is missing for the prefix,
    the 5802 credentials are used.

    @param resource: resource with a ``doi``; a Collection uses the DOI of its top collection
    @return: tuple (crossref_user, crossref_pwd)
    """
    owner = resource.get_top_collection() if resource.classname == "Collection" else resource
    doi = owner.doi

    # "10.5802/alco.21" -> "10.5802" -> "5802"
    registrant = doi.split("/")[0].split(".")[1]
    try:
        credentials = (
            getattr(settings, f"CROSSREF_USER_{registrant}"),
            getattr(settings, f"CROSSREF_PWD_{registrant}"),
        )
    except AttributeError:
        credentials = (settings.CROSSREF_USER_5802, settings.CROSSREF_PWD_5802)
    return credentials

359 

360 

def checkDOI(resource):
    """
    Check the DOI status of a resource through its DOIBatch.

    A DOIBatch with status "En cours" is created first when the resource has none.

    @param resource: the resource to check
    @return: the DOIBatch returned by checkDOIBatch, or None for a book part
             (Article whose container ctype starts with "book", "book-lecture-notes" excepted)
    """
    # A book part is registered at container level: the batch linked to the book part
    # cannot be queried, it only exists to display "En cours" at book part level.
    if resource.classname == "Article":
        container_type = resource.my_container.ctype
        if container_type.startswith("book") and container_type != "book-lecture-notes":
            return None

    batch = get_doibatch(resource)
    if batch is None:
        batch = DOIBatch(resource=resource, status="En cours")
        batch.save()

    return checkDOIBatch(batch)

384 

385 

def _diagnostic_child_text(element, tag):
    """Return the text of the first ``tag`` child of ``element``, or None without such a child."""
    child = element.find(tag)
    return None if child is None else child.text


def checkDOIBatch(doibatch):
    """
    check DOI batch status by HTTP request

    The Crossref answer is expected to be a <doi_batch_diagnostic> document. When the
    batch is completed, the <record_diagnostic> of the resource DOI decides between
    "Enregistré" and "Erreur"; diagnostics of other DOIs (e.g. the journal DOI sent along
    with an article) are only used while the resource DOI has not been found.

    @param doibatch: DOIBatch
    @return: DOIBatch with status and log updated (and saved)
    """

    resource = doibatch.resource
    crossref_user, crossref_pwd = get_user_pwd_crossref(resource)
    url = settings.CROSSREF_BASE_CHECKBATCH_URL_TPL % (crossref_user, crossref_pwd)
    url = url.format(doibatch.pk)
    r = requests.get(url)
    if r.status_code != 200:
        doibatch.status = "Erreur"
        doibatch.log = r.text
        doibatch.save()
        return doibatch

    # parse the returned xml
    tree = etree.XML(r.text.encode("utf8"))
    roots = tree.xpath("/doi_batch_diagnostic")
    # A missing root element or status attribute falls through to the "Erreur" branch below
    batch_status = roots[0].attrib.get("status") if roots else None

    if batch_status == "completed":
        # the batch has been processed
        doibatch.status = "batch terminé"
        doibatch.log = "Pas de DOI associé dans le batch : voir le xml"
        resource_doi_found = False
        for diag in tree.xpath("//*/record_diagnostic"):
            doi = _diagnostic_child_text(diag, "doi")
            log = _diagnostic_child_text(diag, "msg")
            if doi == resource.doi:
                resource_doi_found = True
                if diag.attrib.get("status") == "Success":
                    doibatch.status = "Enregistré"
                else:
                    doibatch.status = "Erreur"
                doibatch.log = log
            elif not resource_doi_found:
                # A record for another DOI must not overwrite the verdict of the resource DOI,
                # whatever the order of the records in the answer.
                doibatch.status = "Erreur"
                doibatch.log = log

    elif batch_status == "in_process" or batch_status == "queued":
        doibatch.status = "En cours"
        doibatch.log = "batch en cours de traitement"
    else:  # refreshed too early after Record DOI
        doibatch.status = "Erreur"
        doibatch.log = (
            f"Attention, il se peut qu'il faille rafraichir un peu plus tard {r.text} "
        )

    doibatch.save()
    return doibatch

446 

447 

def removeOldDataInCrossref(article, testing=False):
    """
    The CRAS 2002-2019 articles were registered by Elsevier
    To remove some metadata in Crossref, we need to provide a separate XML with the fields to remove

    Any existing DOIBatch of the article is deleted and replaced by a new one with
    status "En cours" that stores the generated XML.

    @param article: the article whose old Crossref data is to be removed
    @param testing: Boolean set to True when testing (the XML is printed, nothing is sent to Crossref)
    @return: data {'status': 200 if Crossref answered SUCCESS else 404, 'message': first 1000 bytes
             of the Crossref answer (only when a request was sent)}
    """

    doibatch = get_doibatch(article)
    if doibatch:
        doibatch.delete()

    doibatch = DOIBatch(resource=article, status="En cours")
    doibatch.save()

    context = {"resource": article, "doi_batch_id": f"{doibatch.pk:04d}"}

    timestamp = datetime.now().strftime("%Y%m%d%H%M%S%f")  # len = 20, must be 19
    context["timestamp"] = timestamp[0:19]

    context["mail"] = settings.CROSSREF_MAIL
    template = "crossref/article_remove_old_data.xml"

    if article.date_published:
        article.DOIdate = article.date_published.strftime(
            "<month>%m</month><day>%d</day><year>%Y</year>"
        )

    xml = render_to_string(template_name=template, context=context)

    if testing:
        print(xml)

    doibatch.xml = xml
    doibatch.save()

    files = {"file": (f"{doibatch.pk}.xml", xml)}

    data = {"status": 404}
    if not testing:
        crossref_user, crossref_pwd = get_user_pwd_crossref(article)
        crossref_batch_url = settings.CROSSREF_BATCHURL_TPL % (crossref_user, crossref_pwd)

        r = requests.post(crossref_batch_url, files=files)
        body = r.text.encode("utf8")
        if r.status_code == 200:
            titles = etree.XML(body).xpath("//*/title")
            if titles and titles[0].text == "SUCCESS":
                data["status"] = r.status_code
        elif r.status_code == 401:
            doibatch.status = "Erreur"
            doibatch.log = "Pb d'authentification"
            doibatch.save()
        else:
            doibatch.status = "Erreur"
            doibatch.save()
        # The slice is made on bytes and may cut a multi-byte character:
        # drop the incomplete tail instead of raising UnicodeDecodeError.
        data["message"] = body[:1000].decode("utf-8", errors="ignore")

    return data

511 

512 return data