Coverage for src/ptf_tools/doi.py: 21%

253 statements  

« prev     ^ index     » next       coverage.py v7.7.0, created at 2025-04-03 12:11 +0000

1import os 

2from datetime import datetime 

3 

4import requests 

5from django.conf import settings 

6from django.core.exceptions import ObjectDoesNotExist 

7from django.template.loader import render_to_string 

8from lxml import etree 

9from ptf.cmds.xml import xml_utils 

10from ptf.display.resolver import find_id_type 

11 

12from mersenne_tools.models import DOIBatch 

13 

14 

def get_doibatch(resource):
    """
    Return the DOIBatch attached to a resource, if any.

    @param resource: object whose ``doibatch`` attribute is read
    @return: ``resource.doibatch``, or None when reading it raises ObjectDoesNotExist
    """
    try:
        return resource.doibatch
    except ObjectDoesNotExist:
        return None

23 

24 

def get_or_create_doibatch(resource):
    """
    Refresh and return the DOIBatch of a resource.

    The DOI is first resolved with a non-redirecting GET on settings.DOI_BASE_URL + resource.doi.
    When the answer is a 302, the redirect target is compared with resource.get_url_absolute() and
    the batch is created or updated (then saved) with the outcome of that comparison.
    Otherwise the existing batch, if any, is refreshed through checkDOIBatch.

    @param resource: resource exposing doi, get_url_absolute(), classname and my_container
    @return: new or updated doibatch; None when the DOI does not redirect and the resource has no
             batch, or when the batch was a book-part placeholder that has just been deleted
    """
    # check DOI
    # NOTE(review): no timeout is passed to requests.get, so this call can block indefinitely.
    response = requests.get(settings.DOI_BASE_URL + resource.doi, allow_redirects=False)
    doibatch = get_doibatch(resource)

    if response.status_code == 302:
        # .get() avoids a KeyError on a 302 that carries no Location header: such an answer is
        # reported as a wrong URL.
        location = response.headers.get("Location")
        if location is not None and resource.get_url_absolute() == location:
            status, log = "Enregistré", "Vérifié sur CROSSREF"
        else:
            status, log = "Erreur", "Mauvaise URL pour le DOI !!!/à réenregistrer"

        if doibatch:
            doibatch.status = status
            doibatch.log = log
        else:
            doibatch = DOIBatch(resource=resource, status=status, log=log)
        doibatch.save()
        return doibatch

    # Book-part case: the registration is done at container level, so the batch linked to the
    # book-part cannot be queried; it was only created to display "En cours" on the book-part.
    if (
        doibatch
        and resource.classname == "Article"
        and resource.my_container.ctype.startswith("book")
    ):
        doibatch.delete()
        doibatch = None

    if doibatch:
        doibatch = checkDOIBatch(doibatch)

    return doibatch

67 

68 

# recordDOI works per resource (article).
# Known issue when registering DOIs with CROSSREF:
# - to register a DOI, the DOI of the journal is used as a reference: CROSSREF takes this as a request
#   to register/modify the DOI of the journal itself...
#   When several requests are sent one after the other (Record all DOIs), the processing order is
#   different (random) from the sending order, and we get errors such as:
#   "Record not processed because submitted version: 201810150907372216 is less or equal to previously submitted version {1}"
#   (BUT the record involved here is the journal's one; the article's record generally causes no problem)
#   because there is a timestamp in every request.
#
# To work around these errors (previously only the failure_count number was inspected, and so there
# was always one), the returned XML has to be interpreted:
#   <record_diagnostic status="Success">
#     <doi>10.5802/alco.21</doi>
#     <msg>Successfully updated</msg>
# This is what is retained (in checkDOIBatch).
#

85 

86 

def _append_record_doi_log(message):
    """Append ``message`` and a newline to the record_doi.log file located in settings.LOG_DIR."""
    log_path = os.path.join(settings.LOG_DIR, "record_doi.log")
    with open(log_path, "a", encoding="utf-8") as file_:
        file_.write(message + "\n")


def _resolve_year(raw_year):
    """
    Reduce a container year to a single year.

    @param raw_year: string holding a year ("2010") or a range ("2010-2011")
    @return: raw_year when it parses with "%Y", otherwise its second "-"-separated part
    """
    try:
        datetime.strptime(raw_year, "%Y")
    except ValueError:
        # The value is assumed to be a range such as 2010-2011: keep the second year of the range.
        return raw_year.split("-")[1]
    return raw_year


def recordDOI(resource, testing=False):
    """
    Render the Crossref deposit XML of a resource and, unless testing, post it to Crossref.

    Any existing DOIBatch of the resource is deleted and replaced by a new one with the status
    "En cours"; the rendered XML is stored on that new batch.

    @param resource: object whose classname is "Article", "Container" or "TranslatedArticle"
    @param testing: Boolean set to True when testing; the XML is rendered and stored but not sent
    @return: data {"status": ..., "message": ...}. status is 200 only when Crossref answers 200 with
             a "SUCCESS" title, 404 otherwise. message holds the first 1000 bytes of the Crossref
             answer (absent when testing), or an error text when a Container has no DOI.
    """
    date_format = "<month>%m</month><day>%d</day><year>%Y</year>"
    is_translation = resource.classname == "TranslatedArticle"

    doibatch = get_doibatch(resource)
    if doibatch:
        doibatch.delete()

    doibatch = DOIBatch(resource=resource, status="En cours")
    doibatch.save()

    context = {
        "doi_batch_id": f"{doibatch.pk:04d}",
        # https://data.crossref.org/reports/help/schema_doc/4.4.2/schema_4_4_2.html#timestamp
        # The formatted value is 20 characters long and is cut down to 19.
        "timestamp": datetime.now().strftime("%Y%m%d%H%M%S%f")[0:19],
        "mail": settings.CROSSREF_MAIL,
    }
    template = f"crossref/{resource.classname.lower()}_doi_register.xml"

    # Hack to work out the publication date of the resource.
    if resource.classname == "Article":
        # An article without contributors is registered as a posted-content of type other
        # https://data.crossref.org/reports/help/schema_doc/4.4.2/schema_4_4_2.html#posted_content
        if not resource.get_author_contributions():
            template = "crossref/posted-content.xml"

        container = resource.my_container
        if not resource.date_published and not resource.date_online_first:
            # No date on the article: extrapolate it from the year of the volume.
            year = _resolve_year(container.year)
            resource.DOIdate = f"<year>{year}</year>"
            # No-op when the container year was already a single year.
            container.year = year
        elif resource.date_published:
            # Fill in the date with the format expected by CROSSREF.
            resource.DOIdate = resource.date_published.strftime(date_format)
            # The container year is checked as well (no-op when it is already a single year).
            container.year = _resolve_year(container.year)
        else:
            # Online First
            # TODO: Is it possible to send 2 dates to Crossref ?
            # You can send multiple <publication_date> but it is for multiple media_type (print vs online)
            resource.DOIdate = resource.date_online_first.strftime(date_format)
            # The year of the container is '0' in this case.

    elif resource.classname == "Container":
        if not resource.doi:
            # NOTE(review): the "En cours" batch saved above is left as is on this path.
            return {"status": 404, "message": "Erreur, le numéro n'a pas de doi."}

        if resource.ctype.startswith("book"):
            # PS: book chapters are not handled here, everything is done in the template when
            # the book is registered. The template depends on the ctype.
            if resource.my_collection.issn or resource.my_collection.e_issn:
                template = "crossref/book_series_metadata.xml"
            else:
                template = "crossref/book_set_metadata.xml"
            # A standalone book that does not belong to a series (book_metadata) is not handled.
            context["book_type"] = resource.ctype[5:].replace("-", "_")

            # Each book-part gets a fresh placeholder batch. A dedicated name is used so that
            # `doibatch` keeps pointing at the container batch whose pk is the doi_batch_id.
            for bookpart in resource.article_set.all():
                bookpart_batch = get_doibatch(bookpart)
                if bookpart_batch:
                    bookpart_batch.delete()
                bookpart_batch = DOIBatch(resource=bookpart, status="En cours")
                bookpart_batch.save()

        elif resource.ctype.startswith("issue_special"):
            template = "crossref/issue_doi_register.xml"
            context["title"] = resource.title_html

        resource.DOIdate = f"<year>{_resolve_year(resource.year)}</year>"

    elif is_translation:
        _append_record_doi_log(resource.doi)

        resource.DOIdate = resource.date_published.strftime(date_format)
        context["collection"] = resource.original_article.get_top_collection()

    context["resource"] = resource

    preprint_id = preprint_type = None
    extid = resource.extid_set.filter(id_type="preprint").first()
    if extid:
        preprint_id = extid.id_value
        preprint_type = find_id_type(preprint_id)
        # crossref allows "doi" and "arxiv", but not "hal"
        if preprint_type == "hal":
            preprint_type = "other"
    context["preprint_id"] = preprint_id
    context["preprint_type"] = preprint_type

    abstract = resource.abstract_set.filter(lang=resource.lang, tag="abstract").first()
    if abstract:
        context["abstract"] = xml_utils.get_jats_from_xml_with_formula(
            abstract.value_xml, with_mathml=True
        )

    rdoi_extid = resource.extid_set.filter(id_type="rdoi").first()
    context["rdoi"] = rdoi_extid.id_value if rdoi_extid else None

    try:
        xml = render_to_string(template_name=template, context=context)
        doibatch.xml = xml
        doibatch.save()
    except Exception as e:
        if is_translation:
            _append_record_doi_log(str(e))
        raise

    files = {"file": (f"{doibatch.pk}.xml", xml)}

    data = {"status": 404}
    if not testing:
        if is_translation:
            crossref_user, crossref_pwd = get_user_pwd_crossref(resource.original_article)
            _append_record_doi_log("Call crossref")
        elif resource.classname == "Container" and resource.ctype.startswith("book"):
            # The crossref credentials are taken from the first book part.
            crossref_user, crossref_pwd = get_user_pwd_crossref(resource.article_set.first())
        else:
            crossref_user, crossref_pwd = get_user_pwd_crossref(resource)

        crossref_batch_url = settings.CROSSREF_BATCHURL_TPL % (crossref_user, crossref_pwd)

        # NOTE(review): no timeout is passed to requests.post, so this call can block indefinitely.
        r = requests.post(crossref_batch_url, files=files)
        body = r.text.encode("utf8")
        if r.status_code == 200:
            titles = etree.XML(body).xpath("//*/title")
            if titles and titles[0].text == "SUCCESS":
                data["status"] = r.status_code
        elif r.status_code == 401:
            doibatch.status = "Erreur"
            doibatch.log = "Pb d'authentification"
            doibatch.save()
        else:
            doibatch.status = "Erreur"
            doibatch.save()
        # Cutting the bytes at 1000 may split a multi-byte character: the dangling bytes are
        # dropped instead of raising UnicodeDecodeError.
        data["message"] = body[:1000].decode("utf-8", errors="ignore")

    if is_translation:
        _append_record_doi_log(doibatch.status)
    return data

278 

279 

def get_user_pwd_crossref(resource):
    """
    Get the CROSSREF credentials from the DOI prefix of a resource.

    The part of the DOI located before the first "/" is split on "." and its second element is
    used as a suffix to read the settings CROSSREF_USER_<suffix> and CROSSREF_PWD_<suffix>.
    When one of them is not defined, CROSSREF_USER_5802 and CROSSREF_PWD_5802 are returned.

    @param resource: object exposing a ``doi`` string
    @return: tuple (crossref_user, crossref_pwd)
    """
    registrant, _, _ = resource.doi.partition("/")
    suffix = registrant.split(".")[1]
    try:
        return (
            getattr(settings, "CROSSREF_USER_" + suffix),
            getattr(settings, "CROSSREF_PWD_" + suffix),
        )
    except AttributeError:
        return settings.CROSSREF_USER_5802, settings.CROSSREF_PWD_5802

294 

295 

def checkDOIBatch(doibatch):
    """
    Check the status of a DOI batch with an HTTP request and store the outcome on the batch.

    @param doibatch: DOIBatch
    @return: the same DOIBatch, saved with its status and log updated
    """
    credentials = get_user_pwd_crossref(doibatch.resource)
    check_url = (settings.CROSSREF_BASE_CHECKBATCH_URL_TPL % credentials).format(doibatch.pk)
    r = requests.get(check_url)

    if r.status_code != 200:
        doibatch.status = "Erreur"
        doibatch.log = r.text
        doibatch.save()
        return doibatch

    # Parse the XML sent back.
    tree = etree.XML(r.text.encode("utf8"))
    batch_status = tree.xpath("/doi_batch_diagnostic")[0].attrib["status"]

    if batch_status == "completed":
        # The batch has been processed. These values are kept when there is no record_diagnostic.
        doibatch.status = "batch terminé"
        doibatch.log = "Pas de DOI associé dans le batch : voir le xml"
        for diag in tree.xpath("//*/record_diagnostic"):
            record_doi = diag.xpath("doi")[0].text
            record_msg = diag.xpath("msg")[0].text
            record_status = diag.attrib["status"]
            # Every record overwrites the status and the log, so the last one is retained.
            registered = record_doi == doibatch.resource.doi and record_status == "Success"
            doibatch.status = "Enregistré" if registered else "Erreur"
            doibatch.log = record_msg
    elif batch_status in ("in_process", "queued"):
        doibatch.status = "En cours"
        doibatch.log = "batch en cours de traitement"
    else:
        # Refreshed too early after Record DOI.
        doibatch.status = "Erreur"
        doibatch.log = (
            "Attention, il se peut qu'il faille rafraichir "
            "un peu plus tard {} ".format(r.text)
        )

    doibatch.save()
    return doibatch

346 

347 

def removeOldDataInCrossref(article, testing=False):
    """
    The CRAS 2002-2019 articles were registered by Elsevier
    To remove some metadata in Crossref, we need to provide a separate XML with the fields to remove

    Any existing DOIBatch of the article is deleted and replaced by a new one with the status
    "En cours"; the rendered XML is stored on that new batch.

    @param article:
    @param testing: Boolean set to True when testing; the XML is printed and stored but not sent
    @return: data {"status": ..., "message": ...}. status is 200 only when Crossref answers 200 with
             a "SUCCESS" title, 404 otherwise. message holds the first 1000 bytes of the Crossref
             answer and is absent when testing.
    """
    doibatch = get_doibatch(article)
    if doibatch:
        doibatch.delete()

    doibatch = DOIBatch(resource=article, status="En cours")
    doibatch.save()

    context = {
        "resource": article,
        "doi_batch_id": f"{doibatch.pk:04d}",
        # The formatted value is 20 characters long and is cut down to 19.
        "timestamp": datetime.now().strftime("%Y%m%d%H%M%S%f")[0:19],
        "mail": settings.CROSSREF_MAIL,
    }

    if article.date_published:
        article.DOIdate = article.date_published.strftime(
            "<month>%m</month><day>%d</day><year>%Y</year>"
        )

    xml = render_to_string(template_name="crossref/article_remove_old_data.xml", context=context)

    if testing:
        print(xml)

    doibatch.xml = xml
    doibatch.save()

    files = {"file": (f"{doibatch.pk}.xml", xml)}

    data = {"status": 404}
    if not testing:
        crossref_user, crossref_pwd = get_user_pwd_crossref(article)
        crossref_batch_url = settings.CROSSREF_BATCHURL_TPL % (crossref_user, crossref_pwd)

        # NOTE(review): no timeout is passed to requests.post, so this call can block indefinitely.
        r = requests.post(crossref_batch_url, files=files)
        body = r.text.encode("utf8")
        if r.status_code == 200:
            titles = etree.XML(body).xpath("//*/title")
            if titles and titles[0].text == "SUCCESS":
                data["status"] = r.status_code
        elif r.status_code == 401:
            doibatch.status = "Erreur"
            doibatch.log = "Pb d'authentification"
            doibatch.save()
        else:
            doibatch.status = "Erreur"
            doibatch.save()
        # Cutting the bytes at 1000 may split a multi-byte character: the dangling bytes are
        # dropped instead of raising UnicodeDecodeError.
        data["message"] = body[:1000].decode("utf-8", errors="ignore")

    return data