"""
Provides a bridge from mediawiki to Drupal
Reads a page as HTML using the mediawiki API and imports it as a node to the Drupal system.
The parsed HTML gets processed with BeautifulSoup to remove some unwanted pieces
(like the "edit this section" links or the list of available translations of this page)
Afterwards it writes the page to Drupal by using the JSON:API
For this, the Web Services related modules which are part of Drupal core need to be enabled:
HAL, HTTP Basic Auth, JSON:API, RESTful Web Services, Serialization
Inspired by:
https://weimingchenzero.medium.com/use-python-to-call-drupal-9-core-restful-api-to-create-new-content-9f3fa8628ab4
This uses HTTP Basic Auth. Make sure the communication with the endpoint is encrypted via https!
TODO: improving security by supporting OAuth2
Configuration with credentials is in the [mediawiki2drupal] part of config.ini (see config.example.ini)
"""

import configparser
import logging
import os
from typing import Dict, Final, Optional

import requests
from bs4 import BeautifulSoup, Comment

from pywikitools.fortraininglib import ForTrainingLib


class Mediawiki2Drupal:
    """The main class containing all major functionality to import pages from MediaWiki into Drupal"""
HEADERS: Final[Dict[str, str]] = {
'Accept': 'application/vnd.api+json',
'Content-Type': 'application/vnd.api+json'
}

    def __init__(self, fortraininglib: ForTrainingLib, endpoint: str, username: str, password: str,
                 content_type: str = "page", change_hrefs: Optional[Dict[str, str]] = None,
                 img_src_rewrite: Optional[Dict[str, str]] = None):
        """
        @param content_type: the Drupal content type that we should create articles with.
            This needs to be the system name of a content type
            ("page" is the system name of the "Basic page" content type, one of the defaults in Drupal)
        @param change_hrefs: dictionary mapping <a href=""> values to their replacements
        @param img_src_rewrite: dictionary mapping image file names to new src values
        (an illustrative construction example follows this method)
        """
self._endpoint = endpoint
self._username = username
self._password = password
self._content_type = content_type
self._change_hrefs = change_hrefs
self._img_src_rewrite = img_src_rewrite
self.fortraininglib: Final[ForTrainingLib] = fortraininglib
self.logger: logging.Logger = logging.getLogger('pywikitools.mediawiki2drupal')
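
    # Illustrative construction (endpoint, credentials and mappings below are
    # placeholders/assumptions, not values taken from this repository):
    #   lib = ForTrainingLib("https://www.4training.net", "/mediawiki")
    #   m2d = Mediawiki2Drupal(lib, "https://drupal.example.org/jsonapi", "importer", "secret",
    #                          change_hrefs={"/Prayer": "/node/42"},
    #                          img_src_rewrite={"Hand_1.png": "/sites/default/files/hand_1.png"})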

    def _process_html(self, html: str) -> str:
        """
        Take the original HTML coming from MediaWiki and remove unnecessary tags and attributes.
        If we requested the English original like fortraininglib.get_page_html("Prayer"),
        we would need to remove the [edit] section links. But since we request the page with
        fortraininglib.get_page_html("Prayer/en"), we don't need to take care of that.
        (a rough before/after sketch follows this method)
        TODO: Start using pywikitools.lib.html.BeautifyHTML
        TODO: Subclass BeautifyHTML and overwrite image_rewrite_handler to add customizations for "hands" images
        """
        soup = BeautifulSoup(html, 'html.parser')
        soup.div.unwrap()  # Remove the enclosing <div class="mw-parser-output">...</div>
# Remove the language overview
for element in soup.find_all(class_="noprint"):
element.decompose()
        # Remove HTML comments
        for comment in soup.find_all(string=lambda text: isinstance(text, Comment)):
            comment.extract()
# Changing <h2><span class="mw-headline" id="Headline">Headline</span></h2>
# to <h2>Headline</h2>
# TODO: do we need the id tag again to be able to set internal links?
for element in soup.find_all("span", class_="mw-headline"):
element.unwrap()
# Remove empty <span> tags (not sure why they're even there)
for element in soup.find_all("span"):
if element.string is None:
element.extract()
        # Correct image src attributes
        for element in soup.find_all("img"):
            del element['srcset']
            img_src = str(element['src'])
            last_slash = img_src.rfind('/')
            if last_slash >= 0:
                img_src = img_src[last_slash + 1:]
            if (self._img_src_rewrite is not None) and (img_src in self._img_src_rewrite):
                self.logger.info(f"Replacing img src {element['src']} with {self._img_src_rewrite[img_src]}")
                element['src'] = self._img_src_rewrite[img_src]
            else:
                self.logger.warning(f"Missing img src replacement for {img_src}")
            if img_src.startswith('30px-Hand'):  # some customizations for the five "hands" images in God's Story
                del element['height']
                del element['width']
                element['style'] = 'height:80px; margin-right:20px'
                element['align'] = 'left'
for element in soup.find_all("a", href=True):
if element['href'].startswith("/File:"):
# Remove <a> links around <img> tags
element.unwrap()
continue
# Rewrite hrefs
if self._change_hrefs is not None:
if element['href'] in self._change_hrefs:
self.logger.info(f"Rewriting a href source {element['href']} "
f"with {self._change_hrefs[element['href']]}")
element['href'] = self._change_hrefs[element['href']]
else:
self.logger.warning(f"Couldn't find href rewrite for destination {element['href']}")
del element['title']
return str(soup)
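
    # Rough before/after sketch of what _process_html does (hypothetical input, for illustration):
    #   <div class="mw-parser-output"><h2><span class="mw-headline" id="Pray">Pray</span></h2>
    #   <p>Text <a href="/Prayer" title="Prayer">link</a></p></div>
    # becomes (with change_hrefs={"/Prayer": "/node/42"}):
    #   <h2>Pray</h2>
    #   <p>Text <a href="/node/42">link</a></p>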

    def get_page_id(self, search_criteria: Dict[str, str]) -> Optional[str]:
        """
        Search for a page where the given fields match the given values.
        If at least one page exists, return the ID of the first page found.
        This will issue a warning if more than one page was found.
        @param search_criteria: at least one entry of field_name -> value
            Example: get_page_id({"title": "Gebet"}) will call /jsonapi/node/page?filter[title][value]=Gebet
        @return None in case no matching page was found
        """
        payload = {}
        for field_name, value in search_criteria.items():
            payload[f"filter[{field_name}][value]"] = value
        r = requests.get(f"{self._endpoint}/node/{self._content_type}",
                         auth=(self._username, self._password), params=payload)
        data = r.json().get("data")
        if not isinstance(data, list) or len(data) == 0:
            return None
        if len(data) > 1:
            self.logger.warning(f"Found more than one page with search criteria {search_criteria}.")
        return data[0]["id"]

    def import_page(self, page: str, language_code: str, article_id: Optional[str] = None,
                    custom_fields: Optional[Dict[str, str]] = None) -> bool:
        """
        Request the translated page and import it into Drupal.
        @param article_id: if given, try to patch this existing node (a JSON:API UUID string).
            If None, create a new node. (typical calls are sketched after this method)
        @param custom_fields: allows setting additional fields of the content type to custom values
        @return False on error
        """
title = self.fortraininglib.get_translated_title(page, language_code)
if title is None:
self.logger.warning(f"Importing page failed: Couldn't get translated title of page {page}.")
return False
content = self.fortraininglib.get_page_html(f"{page}/{language_code}")
if content is None:
self.logger.warning(f"Importing page failed: Couldn't get content of page {page}/{language_code}")
return False
payload = {
"data": {
"type": f"node--{self._content_type}",
"attributes": {
"title": title,
"body": {
"value": self._process_html(content, custom_fields),
"format": "full_html"
}
}
}
}
payload["data"]["attributes"].update(custom_fields)
self.logger.debug(payload)
        if article_id is None:
            # Create a new article
            r = requests.post(f"{self._endpoint}/node/{self._content_type}",
                              headers=self.HEADERS, auth=(self._username, self._password), json=payload)
        else:
            # Update the existing article
            payload["data"]["id"] = article_id
            r = requests.patch(f"{self._endpoint}/node/{self._content_type}/{article_id}",
                               headers=self.HEADERS, auth=(self._username, self._password), json=payload)
        self.logger.debug(r.status_code)
        self.logger.debug(r.json())
        if r.status_code not in [200, 201]:
            error = 'No error details given.'
            errors = r.json().get("errors")
            if isinstance(errors, list) and len(errors) > 0:
                if "title" in errors[0]:
                    error = errors[0]["title"]
                if "detail" in errors[0]:
                    error += f". Details: {errors[0]['detail']}"
            self.logger.warning(f"Failed to import page {page}/{language_code}. {error}")
        return r.status_code in [200, 201]
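
    # Typical calls (illustrative; "field_category" is a hypothetical field name):
    #   m2d.import_page("Prayer", "de")                        # create a new node
    #   m2d.import_page("Prayer", "de", article_id=page_id)    # patch an existing node
    #   m2d.import_page("Prayer", "de", custom_fields={"field_category": "essentials"})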
if __name__ == "__main__":
# Read the configuration from config.ini in the same directory
config = configparser.ConfigParser()
config.read(os.path.dirname(os.path.abspath(__file__)) + '/config.ini')
if config.has_option("mediawiki", "baseurl") and config.has_option("mediawiki", "scriptpath") and \
config.has_option("mediawiki2drupal", "endpoint") and \
config.has_option("mediawiki2drupal", "username") and config.has_option("mediawiki2drupal", "password"):
fortraininglib = ForTrainingLib(config.get("mediawiki", "baseurl"), config.get("mediawiki", "scriptpath"))
mediawiki2drupal = Mediawiki2Drupal(fortraininglib, config.get("mediawiki2drupal", "endpoint"),
config.get("mediawiki2drupal", "username"),
config.get("mediawiki2drupal", "password"))
# TODO: Read parameters from command line
mediawiki2drupal.import_page("Prayer", "de")
else:
print("Configuration in config.ini missing. Aborting now")