Coverage for src/ptf_tools/doi.py: 21%

1import os

2from datetime import datetime

4import requests

5from django.conf import settings

6from django.core.exceptions import ObjectDoesNotExist

7from django.template.loader import render_to_string

8from lxml import etree

9from ptf.cmds.xml import xml_utils

10from ptf.display.resolver import find_id_type

12from mersenne_tools.models import DOIBatch

def get_doibatch(resource):
    """
    Return the DOIBatch attached to a resource, if any.

    @param resource: object exposing a ``doibatch`` attribute
    @return: ``resource.doibatch``, or None when reading it raises ObjectDoesNotExist
    """
    try:
        return resource.doibatch
    except ObjectDoesNotExist:
        # No batch has been recorded for this resource.
        return None

def get_or_create_doibatch(resource):
    """
    Refresh (or create) the DOIBatch of a resource.

    The DOI is first requested at ``settings.DOI_BASE_URL`` without following
    redirects. A 302 answer settles the status immediately; any other answer
    falls back to checking the existing batch through checkDOIBatch.

    @param resource: resource whose ``doi`` is checked
    @return: new or updated doibatch, or None when there is nothing to report
    """
    response = requests.get(settings.DOI_BASE_URL + resource.doi, allow_redirects=False)

    # (status, log) to store when the redirect alone settles the question.
    verdict = None
    if response.status_code == 302:
        if resource.get_url_absolute() == response.headers["Location"]:
            verdict = ("Enregistré", "Vérifié sur CROSSREF")
        else:
            verdict = ("Erreur", "Mauvaise URL pour le DOI !!!/à réenregistrer")

    batch = get_doibatch(resource)

    if verdict is not None:
        status, log = verdict
        if batch:
            batch.status = status
            batch.log = log
        else:
            batch = DOIBatch(resource=resource, status=status, log=log)
        batch.save()
        return batch

    if not batch:
        return batch

    # Book parts are registered at container level, so the batch linked to a
    # book part cannot be queried: it was only created to display "En cours"
    # at book-part level.
    is_book_part = resource.classname == "Article" and resource.my_container.ctype.startswith(
        "book"
    )
    if is_book_part:
        batch.delete()
        return None

    return checkDOIBatch(batch)

69# recordDOI par resource (article)

70# problématique liée à l'enregistrement des DOI chez CROSSREF :

71# - pour enregistrer un DOI, on utilise le DOI du journal comme référence : CROSSREF prend ça comme une demande d'enregistrement/modification !

72# du DOI du journal...

73# ce qui se passe lorsque l'on envoie plusieurs requêtes les unes à la suite des autres (Record all DOIs), c'est que l'ordre de traitement est

74# différent (aléatoire) de l'ordre d'envoi et on obtient ces erreurs :

75# "Record not processed because submitted version: 201810150907372216 is less or equal to previously submitted version {1}"

76# ( MAIS le record impliqué ici est celui du journal, celui de l'article ne pose globalement pas de pb)

77# car il y a un timestamp dans chaque requête

78#

79# pour contrer ces erreurs (avant on ne diagnostiquait que le nombre de failure_count et donc il y en avait une) il faut interpréter le xml de retour ::

80# <record_diagnostic status="Success">

81# <doi>10.5802/alco.21</doi>

82# <msg>Successfully updated</msg>

83# C'est ce qui est retenu (dans checkDOIBatch).

84#

def _append_record_doi_log(message):
    """Append ``message`` as one line to ``record_doi.log`` in ``settings.LOG_DIR``."""
    log_path = os.path.join(settings.LOG_DIR, "record_doi.log")
    with open(log_path, "a", encoding="utf-8") as file_:
        file_.write(message + "\n")


def _last_year_of_range(year):
    """
    Return ``year`` unchanged when it parses with the ``%Y`` format.

    Otherwise the value is assumed to be a range such as "2010-2011" and the
    second hyphen-separated field is returned.
    """
    try:
        datetime.strptime(year, "%Y")
    except ValueError:
        return year.split("-")[1]
    return year


def recordDOI(resource, testing=False):
    """
    Render the Crossref deposit XML of a resource and, unless testing, post it.

    Any existing DOIBatch of the resource is replaced by a fresh one with the
    status "En cours"; the rendered XML is stored on that batch. The template
    and the publication date (``resource.DOIdate``) depend on
    ``resource.classname`` ("Article", "Container" or "TranslatedArticle").

    @param resource: resource to register
    @param testing: Boolean set to True when testing; nothing is posted to Crossref
    @return: data {'status': 200 when Crossref answers SUCCESS, 404 otherwise,
             'message': start of the Crossref answer (only when posted)}.
             A Container without DOI returns only {'message': ...}.
    """
    full_date_format = "<month>%m</month><day>%d</day><year>%Y</year>"

    doibatch = get_doibatch(resource)
    if doibatch:
        doibatch.delete()

    doibatch = DOIBatch(resource=resource, status="En cours")
    doibatch.save()

    # https://data.crossref.org/reports/help/schema_doc/4.4.2/schema_4_4_2.html#timestamp
    # The formatted value is 20 characters long, it must be 19.
    timestamp = datetime.now().strftime("%Y%m%d%H%M%S%f")
    context = {
        "doi_batch_id": f"{doibatch.pk:04d}",
        "timestamp": timestamp[0:19],
        "mail": settings.CROSSREF_MAIL,
    }
    template = f"crossref/{resource.classname.lower()}_doi_register.xml"

    # Hack to work out the publication date of the resource.
    if resource.classname == "Article":
        # An article without contributors is registered as a posted-content of type other
        # https://data.crossref.org/reports/help/schema_doc/4.4.2/schema_4_4_2.html#posted_content
        if not resource.get_author_contributions():
            template = "crossref/posted-content.xml"

        if resource.date_published:
            # Date in the format expected by Crossref.
            resource.DOIdate = resource.date_published.strftime(full_date_format)
            # The container year is checked too: a range keeps its 2nd year.
            resource.my_container.year = _last_year_of_range(resource.my_container.year)
        elif resource.date_online_first:
            # Online First
            # TODO: Is it possible to send 2 dates to Crossref ?
            # You can send multiple <publication_date> but it is for multiple media_type (print vs online)
            resource.DOIdate = resource.date_online_first.strftime(full_date_format)
            # The container year is '0' in this case.
        else:
            # No date on the article: extrapolate from the volume year.
            year = _last_year_of_range(resource.my_container.year)
            resource.DOIdate = "<year>%s</year>" % year
            resource.my_container.year = year

    elif resource.classname == "Container":
        if not resource.doi:
            return {"message": "Erreur, le numéro n'a pas de doi."}
        if resource.ctype.startswith("book"):
            # Chapters are not handled separately for books: everything is
            # done in the template when the book itself is registered.
            # The template depends on the ctype. A standalone book (outside
            # any series) would need its own template (book_metadata); that
            # case is not handled.
            if resource.my_collection.issn or resource.my_collection.e_issn:
                template = "crossref/book_series_metadata.xml"
            else:
                template = "crossref/book_set_metadata.xml"
            context["book_type"] = resource.ctype[5:].replace("-", "_")
            for bookpart in resource.article_set.all():
                # Use dedicated names here: rebinding ``doibatch`` would make
                # the code below store the XML and the error status on the
                # last book part's batch instead of the container's batch.
                bookpart_batch = get_doibatch(bookpart)
                if bookpart_batch:
                    bookpart_batch.delete()
                DOIBatch(resource=bookpart, status="En cours").save()

        elif resource.ctype.startswith("issue_special"):
            template = "crossref/issue_doi_register.xml"
            context["title"] = resource.title_html
            resource.DOIdate = "<year>%s</year>" % _last_year_of_range(resource.year)

    elif resource.classname == "TranslatedArticle":
        _append_record_doi_log(resource.doi)
        resource.DOIdate = resource.date_published.strftime(full_date_format)
        context["collection"] = resource.original_article.get_top_collection()

    context["resource"] = resource

    preprint_id = preprint_type = None
    preprint_extid = resource.extid_set.filter(id_type="preprint").first()
    if preprint_extid is not None:
        preprint_id = preprint_extid.id_value
        preprint_type = find_id_type(preprint_id)
        # crossref allows "doi" and "arxiv", but not "hal"
        if preprint_type == "hal":
            preprint_type = "other"
    context["preprint_id"] = preprint_id
    context["preprint_type"] = preprint_type

    abstract = resource.abstract_set.filter(lang=resource.lang, tag="abstract").first()
    if abstract:
        context["abstract"] = xml_utils.get_jats_from_xml_with_formula(
            abstract.value_xml, with_mathml=True
        )

    rdoi = None
    rdoi_extid = resource.extid_set.filter(id_type="rdoi").first()
    if rdoi_extid is not None:
        rdoi = rdoi_extid.id_value
    context["rdoi"] = rdoi

    try:
        xml = render_to_string(template_name=template, context=context)
        doibatch.xml = xml
        doibatch.save()
    except Exception as e:
        if resource.classname == "TranslatedArticle":
            _append_record_doi_log(str(e))
        # Bare raise keeps the original traceback.
        raise

    files = {"file": (f"{doibatch.pk}.xml", xml)}

    data = {"status": 404}
    if not testing:
        if resource.classname == "TranslatedArticle":
            crossref_user, crossref_pwd = get_user_pwd_crossref(resource.original_article)
            _append_record_doi_log("Call crossref")
        elif resource.classname == "Container" and resource.ctype.startswith("book"):
            # The Crossref credentials are looked up from the first book part.
            crossref_user, crossref_pwd = get_user_pwd_crossref(resource.article_set.first())
        else:
            crossref_user, crossref_pwd = get_user_pwd_crossref(resource)

        crossref_batch_url = settings.CROSSREF_BATCHURL_TPL % (crossref_user, crossref_pwd)

        r = requests.post(crossref_batch_url, files=files)
        body = r.text.encode("utf8")
        if r.status_code == 200:
            title = etree.XML(body).xpath("//*/title")[0].text
            if title == "SUCCESS":
                data["status"] = r.status_code
        elif r.status_code == 401:
            doibatch.status = "Erreur"
            doibatch.log = "Pb d'authentification"
            doibatch.save()
        else:
            doibatch.status = "Erreur"
            doibatch.save()
        # Cutting at 1000 bytes may split a multi-byte character; drop the
        # incomplete tail instead of raising UnicodeDecodeError.
        data["message"] = body[:1000].decode("utf-8", errors="ignore")

    if resource.classname == "TranslatedArticle":
        _append_record_doi_log(doibatch.status)
    return data

278

279

def get_user_pwd_crossref(resource):
    """
    Get the CROSSREF credentials matching the DOI prefix of a resource.

    For a DOI such as "10.5802/alco.21" the settings ``CROSSREF_USER_5802``
    and ``CROSSREF_PWD_5802`` are read. When either setting is missing for
    the prefix, the 5802 credentials are used instead.

    @param resource: object with a ``doi`` attribute
    @return: (crossref_user, crossref_pwd)
    """
    doi_prefix = resource.doi.partition("/")[0]
    member_code = doi_prefix.split(".")[1]
    try:
        return (
            getattr(settings, f"CROSSREF_USER_{member_code}"),
            getattr(settings, f"CROSSREF_PWD_{member_code}"),
        )
    except AttributeError:
        return settings.CROSSREF_USER_5802, settings.CROSSREF_PWD_5802

294

295

def checkDOIBatch(doibatch):
    """
    Check a DOI batch status with an HTTP request to Crossref.

    @param doibatch: DOIBatch
    @return: the same DOIBatch, saved with its status and log updated
    """
    user, password = get_user_pwd_crossref(doibatch.resource)
    check_url = settings.CROSSREF_BASE_CHECKBATCH_URL_TPL % (user, password)
    response = requests.get(check_url.format(doibatch.pk))

    if response.status_code != 200:
        doibatch.status = "Erreur"
        doibatch.log = response.text
        doibatch.save()
        return doibatch

    # Parse the XML diagnostic returned by Crossref.
    tree = etree.XML(response.text.encode("utf8"))
    batch_status = tree.xpath("/doi_batch_diagnostic")[0].attrib["status"]

    if batch_status == "completed":
        # The batch has been processed; these values stay when it holds no
        # record diagnostic at all.
        doibatch.status = "batch terminé"
        doibatch.log = "Pas de DOI associé dans le batch : voir le xml"
        # NOTE(review): nesting reconstructed from a flattened source; the
        # second "else" is read as belonging to the DOI comparison and the
        # log as being set for every diagnostic. Confirm against the original.
        for diag in tree.xpath("//*/record_diagnostic"):
            doi = diag.xpath("doi")[0].text
            log = diag.xpath("msg")[0].text
            status = diag.attrib["status"]
            registered = doi == doibatch.resource.doi and status == "Success"
            doibatch.status = "Enregistré" if registered else "Erreur"
            doibatch.log = log
    elif batch_status in ("in_process", "queued"):
        doibatch.status = "En cours"
        doibatch.log = "batch en cours de traitement"
    else:
        # Refreshed too soon after Record DOI.
        doibatch.status = "Erreur"
        doibatch.log = (
            "Attention, il se peut qu'il faille rafraichir un peu plus tard {} ".format(
                response.text
            )
        )

    doibatch.save()
    return doibatch

345

346

def removeOldDataInCrossref(article, testing=False):
    """
    The CRAS 2002-2019 articles were registered by Elsevier
    To remove some metadata in Crossref, we need to provide a separate XML with the fields to remove

    Any existing DOIBatch of the article is replaced by a fresh one with the
    status "En cours"; the rendered XML is stored on that batch.

    @param article:
    @param testing: Boolean set to True when testing; the XML is printed and
                    nothing is posted to Crossref
    @return: data {'status': 200 when Crossref answers SUCCESS, 404 otherwise,
             'message': start of the Crossref answer (only when posted)}
    """
    doibatch = get_doibatch(article)
    if doibatch:
        doibatch.delete()

    doibatch = DOIBatch(resource=article, status="En cours")
    doibatch.save()

    # The formatted value is 20 characters long, it must be 19.
    timestamp = datetime.now().strftime("%Y%m%d%H%M%S%f")
    context = {
        "resource": article,
        "doi_batch_id": f"{doibatch.pk:04d}",
        "timestamp": timestamp[0:19],
        "mail": settings.CROSSREF_MAIL,
    }

    if article.date_published:
        article.DOIdate = article.date_published.strftime(
            "<month>%m</month><day>%d</day><year>%Y</year>"
        )

    # Rendering errors propagate to the caller unchanged.
    xml = render_to_string(template_name="crossref/article_remove_old_data.xml", context=context)
    if testing:
        print(xml)
    doibatch.xml = xml
    doibatch.save()

    files = {"file": (f"{doibatch.pk}.xml", xml)}

    data = {"status": 404}
    if not testing:
        crossref_user, crossref_pwd = get_user_pwd_crossref(article)
        crossref_batch_url = settings.CROSSREF_BATCHURL_TPL % (crossref_user, crossref_pwd)

        r = requests.post(crossref_batch_url, files=files)
        body = r.text.encode("utf8")
        if r.status_code == 200:
            title = etree.XML(body).xpath("//*/title")[0].text
            if title == "SUCCESS":
                data["status"] = r.status_code
        elif r.status_code == 401:
            doibatch.status = "Erreur"
            doibatch.log = "Pb d'authentification"
            doibatch.save()
        else:
            doibatch.status = "Erreur"
            doibatch.save()
        # Cutting at 1000 bytes may split a multi-byte character; drop the
        # incomplete tail instead of raising UnicodeDecodeError.
        data["message"] = body[:1000].decode("utf-8", errors="ignore")

    return data