Coverage for src/ptf_tools/doi.py: 21%

256 statements  

« prev     ^ index     » next       coverage.py v7.8.2, created at 2025-06-25 10:06 +0000

1import os 

2from datetime import datetime 

3 

4import requests 

5from django.conf import settings 

6from django.core.exceptions import ObjectDoesNotExist 

7from django.template.loader import render_to_string 

8from lxml import etree 

9from ptf.cmds.xml import xml_utils 

10from ptf.display.resolver import find_id_type 

11from ptf.templatetags.helpers import search_license 

12 

13from mersenne_tools.models import DOIBatch 

14 

15 

def get_doibatch(resource):
    """
    Fetch the DOIBatch linked to a resource.

    @param resource: object exposing a ``doibatch`` attribute
    @return: the linked doibatch, or None when reading it raises ObjectDoesNotExist
    """
    try:
        return resource.doibatch
    except ObjectDoesNotExist:
        return None

24 

25 

def get_or_create_doibatch(resource):
    """
    Refresh (or create) the DOIBatch of a resource from the current state of its DOI.

    The DOI is resolved through settings.DOI_BASE_URL without following redirects.
    On a 302 answer the batch is created or updated immediately: "Enregistré" when
    the redirect target is the resource URL, "Erreur" otherwise. For any other
    answer, an existing batch is refreshed through checkDOIBatch.

    @param resource: resource exposing doi, classname and get_url_absolute()
    @return: new or updated doibatch (None when there is nothing to report)
    """
    response = requests.get(settings.DOI_BASE_URL + resource.doi, allow_redirects=False)

    # A 302 answer means the DOI is known: compare its target with our own URL.
    verdict = None
    if response.status_code == 302:
        if resource.get_url_absolute() == response.headers["Location"]:
            verdict = ("Enregistré", "Vérifié sur CROSSREF")
        else:
            verdict = ("Erreur", "Mauvaise URL pour le DOI !!!/à réenregistrer")

    batch = get_doibatch(resource)

    if verdict is not None:
        status, log = verdict
        if batch:
            batch.status = status
            batch.log = log
        else:
            batch = DOIBatch(resource=resource, status=status, log=log)
        batch.save()
        return batch

    if not batch:
        return batch

    # Book parts are registered at the container level, so the batch attached to a
    # book part cannot be queried: it was only created to display "En cours" on
    # the book part.
    if resource.classname == "Article" and resource.my_container.ctype.startswith("book"):
        batch.delete()
        return None

    return checkDOIBatch(batch)

68 

69 

# recordDOI works per resource (article).
# Known issue when registering DOIs with CROSSREF:
# - to register a DOI, the journal DOI is used as a reference: CROSSREF treats this as a
#   request to register/modify the journal DOI as well...
#   When several requests are sent one right after the other (Record all DOIs), they are
#   processed in a different (random) order than the one they were sent in, and we get errors like:
#   "Record not processed because submitted version: 201810150907372216 is less or equal to previously submitted version {1}"
#   (BUT the record involved here is the journal's one; the article's record is generally fine)
#   because every request carries a timestamp.
#
# To work around these errors (previously only the failure_count was inspected, and so
# there was one failure), the returned XML has to be interpreted:
# <record_diagnostic status="Success">
# <doi>10.5802/alco.21</doi>
# <msg>Successfully updated</msg>
# This is what checkDOIBatch relies on.
#

86 

87 

def _append_record_doi_log(line):
    """Append one line to the record_doi.log file located in settings.LOG_DIR."""
    with open(os.path.join(settings.LOG_DIR, "record_doi.log"), "a", encoding="utf-8") as file_:
        file_.write(line + "\n")


def recordDOI(resource, testing=False):
    """
    Render the Crossref deposit XML of a resource and submit it.

    Any existing DOIBatch of the resource is replaced by a fresh one with status
    "En cours"; the rendered XML is stored on that batch before being posted.

    @param resource: resource to register (classname "Article", "Container" or
                     "TranslatedArticle" get specific handling)
    @param testing: Boolean set to True when testing; the XML is rendered and stored
                    but nothing is sent to Crossref
    @return: data {'status': 200 when Crossref answers SUCCESS, 404 otherwise,
                   'message': first 1000 characters of the Crossref answer (only when not testing)}
             A Container without DOI returns {'message': ...} only.
    """

    doibatch = get_doibatch(resource)
    if doibatch:
        doibatch.delete()

    doibatch = DOIBatch(resource=resource, status="En cours")
    doibatch.save()
    context = {}
    context["doi_batch_id"] = f"{doibatch.pk:04d}"
    # https://data.crossref.org/reports/help/schema_doc/4.4.2/schema_4_4_2.html#timestamp
    timestamp = datetime.now().strftime("%Y%m%d%H%M%S%f")  # len = 20, must be 19
    context["timestamp"] = timestamp[0:19]
    context["mail"] = settings.CROSSREF_MAIL
    template = f"crossref/{resource.classname.lower()}_doi_register.xml"
    crossref_user = None
    crossref_pwd = None

    # Hack to work out the publication date of a resource
    if resource.classname == "Article":
        # An article without contributors is registered as a posted-content of type other
        # https://data.crossref.org/reports/help/schema_doc/4.4.2/schema_4_4_2.html#posted_content
        if not resource.get_author_contributions():
            template = "crossref/posted-content.xml"

        # This kind of object needs a publication date
        if not resource.date_published and not resource.date_online_first:
            # Extrapolate the date from the volume
            try:
                datetime.strptime(resource.my_container.year, "%Y")
                resource.DOIdate = "<year>%s</year>" % resource.my_container.year
            except ValueError:
                # Assume a range such as 2010-2011 and keep its 2nd year
                year = resource.my_container.year.split("-")[1]
                resource.DOIdate = "<year>%s</year>" % year
                resource.my_container.year = year
        elif resource.date_published:
            # Format the date the way CROSSREF expects it
            resource.DOIdate = resource.date_published.strftime(
                "<month>%m</month><day>%d</day><year>%Y</year>"
            )

            # Check the container date as well
            try:
                datetime.strptime(resource.my_container.year, "%Y")
            except ValueError:
                # Assume a range such as 2010-2011 and keep its 2nd year
                year = resource.my_container.year.split("-")[1]
                resource.my_container.year = year
        else:
            # Online First
            # TODO: Is it possible to send 2 dates to Crossref ?
            # You can send multiple <publication_date> but it is for multiple media_type (print vs online)
            resource.DOIdate = resource.date_online_first.strftime(
                "<month>%m</month><day>%d</day><year>%Y</year>"
            )

            # The container year is '0' in this case

    elif resource.classname == "Container":
        if not resource.doi:
            return {"message": "Erreur, le numéro n'a pas de doi."}
        if resource.ctype.startswith("book"):
            # NB: book chapters are not handled here, everything is done by the template
            # when the book itself is registered
            # The template depends on the ctype
            if resource.my_collection.issn or resource.my_collection.e_issn:
                template = "crossref/book_series_metadata.xml"
            else:
                template = "crossref/book_set_metadata.xml"
            # else #standalone book that does not belong to a series
            # template = book_metadata
            context["book_type"] = resource.ctype[5:].replace("-", "_")
            # Give each book part its own "En cours" batch. A dedicated variable is
            # used so that `doibatch` keeps pointing at the batch of the container,
            # whose pk was already sent as doi_batch_id.
            for bookpart in resource.article_set.all():
                bookpart_batch = get_doibatch(bookpart)
                if bookpart_batch:
                    bookpart_batch.delete()
                bookpart_batch = DOIBatch(resource=bookpart, status="En cours")
                bookpart_batch.save()

        elif resource.ctype.startswith("issue_special"):
            template = "crossref/issue_doi_register.xml"
            context["title"] = resource.title_html

            try:
                datetime.strptime(resource.year, "%Y")
                resource.DOIdate = "<year>%s</year>" % resource.year
            except ValueError:
                # Assume a range such as 2010-2011 and keep its 2nd year
                year = resource.year.split("-")[1]
                resource.DOIdate = "<year>%s</year>" % year

    elif resource.classname == "TranslatedArticle":
        _append_record_doi_log(resource.doi)

        resource.DOIdate = resource.date_published.strftime(
            "<month>%m</month><day>%d</day><year>%Y</year>"
        )
        context["collection"] = resource.original_article.get_top_collection()

    context["resource"] = resource

    preprint_id = preprint_type = None
    preprint_extid = resource.extid_set.filter(id_type="preprint").first()
    if preprint_extid:
        preprint_id = preprint_extid.id_value
        preprint_type = find_id_type(preprint_id)
        # crossref allows "doi" and "arxiv", but not "hal"
        if preprint_type == "hal":
            preprint_type = "other"
    context["preprint_id"] = preprint_id
    context["preprint_type"] = preprint_type
    abstract = resource.abstract_set.filter(lang=resource.lang, tag="abstract").first()
    if abstract:
        context["abstract"] = xml_utils.get_crossref_jats_from_xml_with_formula(
            abstract.value_xml, with_mathml=True
        )
    if resource.license:
        context["license"] = search_license(resource)

    rdoi = None
    rdoi_extid = resource.extid_set.filter(id_type="rdoi").first()
    if rdoi_extid:
        rdoi = rdoi_extid.id_value
    context["rdoi"] = rdoi

    try:
        xml = render_to_string(template_name=template, context=context)
        doibatch.xml = xml
        doibatch.save()
    except Exception as e:
        if resource.classname == "TranslatedArticle":
            _append_record_doi_log(str(e))
        # Bare raise keeps the original traceback
        raise

    files = {"file": (f"{doibatch.pk}.xml", xml)}

    data = {"status": 404}
    if not testing:
        if resource.classname == "TranslatedArticle":
            crossref_user, crossref_pwd = get_user_pwd_crossref(resource.original_article)
            _append_record_doi_log("Call crossref")
        elif resource.classname == "Container" and resource.ctype.startswith("book"):
            # The crossref credentials are taken from the 1st book part
            crossref_user, crossref_pwd = get_user_pwd_crossref(resource.article_set.first())
        else:
            crossref_user, crossref_pwd = get_user_pwd_crossref(resource)

        crossref_batch_url = settings.CROSSREF_BATCHURL_TPL % (crossref_user, crossref_pwd)

        r = requests.post(crossref_batch_url, files=files)
        if r.status_code == 200:
            # lxml is fed bytes, as in the rest of this module
            answer = etree.XML(r.text.encode("utf8"))
            title = answer.xpath("//*/title")[0].text
            if title == "SUCCESS":
                data["status"] = r.status_code
        elif r.status_code == 401:
            doibatch.status = "Erreur"
            doibatch.log = "Pb d'authentification"
            doibatch.save()
        else:
            doibatch.status = "Erreur"
            doibatch.save()
        # Truncate the text, not its UTF-8 bytes: cutting bytes could split a
        # multi-byte character and make the decoding fail.
        data["message"] = r.text[:1000]

    if resource.classname == "TranslatedArticle":
        _append_record_doi_log(doibatch.status)
    return data

281 

282 

def get_user_pwd_crossref(resource):
    """
    Look up the CROSSREF credentials matching the DOI prefix of a resource.

    The part of the DOI prefix located after the first "." (5802 for
    10.5802/alco.21) selects the CROSSREF_USER_<n> / CROSSREF_PWD_<n> settings.
    When one of them is missing, the 5802 credentials are used instead.

    @param resource: object whose doi attribute holds the DOI
    @return: (crossref_user, crossref_pwd)
    """
    doi_prefix = resource.doi.split("/")[0]
    registrant = doi_prefix.split(".")[1]
    try:
        user = getattr(settings, f"CROSSREF_USER_{registrant}")
        password = getattr(settings, f"CROSSREF_PWD_{registrant}")
    except AttributeError:
        return settings.CROSSREF_USER_5802, settings.CROSSREF_PWD_5802
    return user, password

297 

298 

def checkDOIBatch(doibatch):
    """
    check DOI batch status by HTTP request

    The Crossref diagnostic of the batch is fetched and its record_diagnostic
    entries are read one by one: the batch ends up "Enregistré" only when the
    last entry read is a Success for the DOI of the resource.

    @param doibatch: DOIBatch
    @return: DOIBatch with status and log updated (and saved)
    """

    resource = doibatch.resource
    crossref_user, crossref_pwd = get_user_pwd_crossref(resource)
    url = settings.CROSSREF_BASE_CHECKBATCH_URL_TPL % (crossref_user, crossref_pwd)
    url = url.format(doibatch.pk)
    r = requests.get(url)
    if r.status_code == 200:
        # Parse the returned XML
        tree = etree.XML(r.text.encode("utf8"))
        elem = tree.xpath("/doi_batch_diagnostic")[0]
        batch_status = elem.attrib["status"]
        if batch_status == "completed":
            # The batch has been processed
            doibatch.status = "batch terminé"
            doibatch.log = "Pas de DOI associé dans le batch : voir le xml"
            for diag in tree.xpath("//*/record_diagnostic"):
                # findtext returns None when the child is missing, so a diagnostic
                # without <doi> or <msg> is reported as an error instead of
                # raising IndexError.
                doi = diag.findtext("doi")
                log = diag.findtext("msg")
                status = diag.attrib["status"]
                if doi == resource.doi and status == "Success":
                    doibatch.status = "Enregistré"
                else:
                    doibatch.status = "Erreur"
                doibatch.log = log

        elif batch_status == "in_process" or batch_status == "queued":
            doibatch.status = "En cours"
            doibatch.log = "batch en cours de traitement"
        else:  # refreshed too early after Record DOI
            doibatch.status = "Erreur"
            doibatch.log = (
                f"Attention, il se peut qu'il faille rafraichir un peu plus tard {r.text} "
            )
    else:
        doibatch.status = "Erreur"
        doibatch.log = r.text
    doibatch.save()
    return doibatch

348 

349 

def removeOldDataInCrossref(article, testing=False):
    """
    The CRAS 2002-2019 articles were registered by Elsevier
    To remove some metadata in Crossref, we need to provide a separate XML with the fields to remove

    Any existing DOIBatch of the article is replaced by a fresh one with status
    "En cours"; the rendered XML is stored on that batch before being posted.

    @param article:
    @param testing: Boolean set to True when testing; the XML is printed and stored
                    but nothing is sent to Crossref
    @return: data {'status': 200 when Crossref answers SUCCESS, 404 otherwise,
                   'message': first 1000 characters of the Crossref answer (only when not testing)}
    """

    doibatch = get_doibatch(article)
    if doibatch:
        doibatch.delete()

    doibatch = DOIBatch(resource=article, status="En cours")
    doibatch.save()

    context = {"resource": article, "doi_batch_id": f"{doibatch.pk:04d}"}

    timestamp = datetime.now().strftime("%Y%m%d%H%M%S%f")  # len = 20, must be 19
    context["timestamp"] = timestamp[0:19]

    context["mail"] = settings.CROSSREF_MAIL
    template = "crossref/article_remove_old_data.xml"

    if article.date_published:
        article.DOIdate = article.date_published.strftime(
            "<month>%m</month><day>%d</day><year>%Y</year>"
        )

    # Rendering errors simply propagate to the caller
    xml = render_to_string(template_name=template, context=context)

    if testing:
        print(xml)

    doibatch.xml = xml
    doibatch.save()

    files = {"file": (f"{doibatch.pk}.xml", xml)}

    data = {"status": 404}
    if not testing:
        crossref_user, crossref_pwd = get_user_pwd_crossref(article)
        crossref_batch_url = settings.CROSSREF_BATCHURL_TPL % (crossref_user, crossref_pwd)

        r = requests.post(crossref_batch_url, files=files)
        if r.status_code == 200:
            # lxml is fed bytes, as in the rest of this module
            answer = etree.XML(r.text.encode("utf8"))
            title = answer.xpath("//*/title")[0].text
            if title == "SUCCESS":
                data["status"] = r.status_code
        elif r.status_code == 401:
            doibatch.status = "Erreur"
            doibatch.log = "Pb d'authentification"
            doibatch.save()
        else:
            doibatch.status = "Erreur"
            doibatch.save()
        # Truncate the text, not its UTF-8 bytes: cutting bytes could split a
        # multi-byte character and make the decoding fail.
        data["message"] = r.text[:1000]

    return data