# MedicineSearch/import_pbs_xml.py
# (source listing metadata: 168 lines, 8.9 KiB, Python)

# Copyright © 2023 Lee Yingtong Li (RunasSudo)
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU Affero General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU Affero General Public License for more details.
#
# You should have received a copy of the GNU Affero General Public License
# along with this program. If not, see <https://www.gnu.org/licenses/>.
import sqlite3
import zipfile
from xml.etree import ElementTree as ET
# Connect to the SQLite database file and obtain the cursor used for every statement below
con = sqlite3.connect('database.db')
cur = con.cursor()

# Rebuild the schema from scratch: each table is dropped if it already exists and is then
# created empty. The statements are executed in the order listed.
_schema_statements = (
    'DROP TABLE IF EXISTS pbs_item',
    'CREATE TABLE pbs_item (code TEXT PRIMARY KEY, mpp_preferred_term TEXT, mp_preferred_term TEXT, maximum_prescribable_units INTEGER, number_repeats INTEGER, benefit_type TEXT)',
    'DROP TABLE IF EXISTS pbs_item_restriction',
    'CREATE TABLE pbs_item_restriction (item_code TEXT, restriction_code INTEGER)',
    'DROP TABLE IF EXISTS pbs_restriction',
    'CREATE TABLE pbs_restriction (code INTEGER PRIMARY KEY, treatment_of INTEGER, indication TEXT, criteria_operator TEXT)',
    'DROP TABLE IF EXISTS pbs_restriction_criteria',
    'CREATE TABLE pbs_restriction_criteria (restriction_code INTEGER, criteria_code INTEGER)',
    'DROP TABLE IF EXISTS pbs_criteria',
    'CREATE TABLE pbs_criteria (code INTEGER PRIMARY KEY, type TEXT, parameters_operator TEXT)',
    'DROP TABLE IF EXISTS pbs_criteria_parameter',
    'CREATE TABLE pbs_criteria_parameter (id INTEGER PRIMARY KEY AUTOINCREMENT, criteria_code INTEGER, text TEXT)',
)
for _statement in _schema_statements:
    cur.execute(_statement)
# Load the PBS schedule: the XML document is parsed straight out of the ZIP archive
with zipfile.ZipFile('data/2023-01-01-xml-V3.zip', 'r') as zipf, zipf.open('sch-2023-01-01-r1.xml', 'r') as f:
    tree = ET.parse(f)
print('Parsed XML')

root = tree.getroot()

# Prefix -> namespace URI map handed to every find()/findall() call
ns = {
    'pbs': 'http://schema.pbs.gov.au/',
    'xlink': 'http://www.w3.org/1999/xlink',
    'xml': 'http://www.w3.org/XML/1998/namespace',
    'rdf': 'http://www.w3.org/1999/02/22-rdf-syntax-ns#',
    'dbk': 'http://docbook.org/ns/docbook',
}

# Get General Schedule: the first program whose info/code text is 'GE'
# (next() raises StopIteration when no program matches)
schedule = root.find('pbs:schedule', ns)
program = next(
    candidate
    for candidate in schedule.findall('pbs:program', ns)
    if candidate.find('pbs:info', ns).find('pbs:code', ns).text == 'GE'
)

# xml:id values of the restrictions and criteria referenced so far; the loops further down
# add to these sets and then work through them
restrictions_to_parse = set()
criteria_to_parse = set()
# Get schedule items (prescribing-rule) in schedule.
# For every prescribing rule that has a benefit available to medical practitioners, this writes
# one pbs_item row and one pbs_item_restriction row per referenced restriction, and queues the
# xml:id of each referenced restriction in restrictions_to_parse.

# Fully-qualified attribute names, as ElementTree exposes namespaced attributes
xlink_href = '{http://www.w3.org/1999/xlink}href'
rdf_resource = '{http://www.w3.org/1999/02/22-rdf-syntax-ns#}resource'
xml_id = '{http://www.w3.org/XML/1998/namespace}id'
pbs_tag = '{' + ns['pbs'] + '}'

# rdf:resource URI of a benefit-type element -> value stored in pbs_item.benefit_type
benefit_type_names = {
    'http://pbs.gov.au/benefit-type/unrestricted': 'unrestricted',
    'http://pbs.gov.au/benefit-type/restricted': 'restricted',
    'http://pbs.gov.au/benefit-type/streamlined': 'streamlined',
    'http://pbs.gov.au/benefit-type/authority-required': 'authority',
}

# Index the direct children of the drugs list by (tag, xml:id) once, instead of linearly
# scanning the whole list twice for every item. setdefault keeps the first element for a key,
# which is the element find() would have returned.
drugs_list = root.find('pbs:drugs-list', ns)
drugs_by_tag_and_id = {}
for drug in ([] if drugs_list is None else drugs_list):
    drugs_by_tag_and_id.setdefault((drug.tag, drug.get(xml_id)), drug)

for item in program.findall('pbs:prescribing-rule', ns):
    code = item.find('pbs:code', ns).text

    # Only get benefits available to medical practitioners.
    # The match is compared with None explicitly: an Element without child elements is falsy,
    # so a plain truth test would discard a matching but childless member-of element.
    benefits = [
        b for b in item.find('pbs:benefit-types-list', ns).findall('pbs:benefit-type', ns)
        if b.find('pbs:member-of-list', ns).find('pbs:member-of[@rdf:resource="http://pbs.gov.au/prescriber/medical"]', ns) is not None
    ]
    if not benefits:
        continue
    assert len(benefits) == 1
    benefit = benefits[0]

    ready_prepared = item.find('pbs:ready-prepared', ns)

    # Follow the item's mpp-reference, then that mpp's mp-reference, to get both preferred terms.
    # A reference to an element missing from the drugs list raises KeyError.
    mpp_id = ready_prepared.find('pbs:mpp-reference', ns).get(xlink_href).lstrip('#')
    mpp = drugs_by_tag_and_id[(pbs_tag + 'mpp', mpp_id)]
    mpp_preferred_term = mpp.find('pbs:preferred-term', ns).text

    mp_id = mpp.find('pbs:drug-references-list', ns).find('pbs:mp-reference', ns).get(xlink_href).lstrip('#')
    mp = drugs_by_tag_and_id[(pbs_tag + 'mp', mp_id)]
    mp_preferred_term = mp.find('pbs:preferred-term[@rdf:resource="http://pbs.gov.au/clinical"]', ns).text

    max_units = ready_prepared.find('pbs:maximum-prescribable[@rdf:resource="http://pbs.gov.au/reference/unit-of-use"]', ns).find('pbs:value', ns).text
    max_repeats = ready_prepared.find('pbs:number-repeats', ns).find('pbs:value', ns).text

    # An unrecognised benefit type URI raises KeyError
    benefit_type = benefit_type_names[benefit.get(rdf_resource)]

    cur.execute('INSERT INTO pbs_item (code, mpp_preferred_term, mp_preferred_term, maximum_prescribable_units, number_repeats, benefit_type) VALUES (?, ?, ?, ?, ?, ?)', (code, mpp_preferred_term, mp_preferred_term, max_units, max_repeats, benefit_type))

    # Get restrictions (the list element is optional)
    restrictions = benefit.find('pbs:restriction-references-list', ns)
    if restrictions is not None:
        for restriction_reference in restrictions.findall('pbs:restriction-reference', ns):
            restriction_id = restriction_reference.get(xlink_href).lstrip('#')
            restriction_code = restriction_reference.find('pbs:code', ns).text

            cur.execute('INSERT INTO pbs_item_restriction (item_code, restriction_code) VALUES (?, ?)', (code, restriction_code))

            # Queue this restriction for parsing
            restrictions_to_parse.add(restriction_id)
# Parse restrictions.
# For every queued restriction this writes one pbs_restriction row (with the indication name
# built from episodicity, severity and condition), one pbs_restriction_criteria row per
# referenced criteria, and queues the xml:id of each referenced criteria in criteria_to_parse.

# Fully-qualified attribute names, as ElementTree exposes namespaced attributes
xlink_href = '{http://www.w3.org/1999/xlink}href'
xml_id = '{http://www.w3.org/XML/1998/namespace}id'
pbs_tag = '{' + ns['pbs'] + '}'

# Index the direct children of the prescribing texts list by (tag, xml:id) once, instead of
# linearly scanning the whole list for every lookup. setdefault keeps the first element for a
# key, which is the element find() would have returned.
prescribing_texts = root.find('pbs:prescribing-texts-list', ns)
texts_by_tag_and_id = {}
for text_element in ([] if prescribing_texts is None else prescribing_texts):
    texts_by_tag_and_id.setdefault((text_element.tag, text_element.get(xml_id)), text_element)


def _referenced_preferred_term(kind, reference):
    """Return the stripped preferred-term text of the pbs:<kind> element that *reference* links to.

    Raises KeyError if the linked element is not in the prescribing texts list.
    """
    referenced_id = reference.get(xlink_href).lstrip('#')
    referenced = texts_by_tag_and_id[(pbs_tag + kind, referenced_id)]
    return referenced.find('pbs:preferred-term', ns).text.strip()


for restriction_id in sorted(restrictions_to_parse):
    restriction = texts_by_tag_and_id[(pbs_tag + 'restriction', restriction_id)]
    code = restriction.find('pbs:code[@rdf:resource="http://pbs.gov.au/code/restriction"]', ns).text
    treatment_of = restriction.find('pbs:code[@rdf:resource="http://pbs.gov.au/code/treatment-of"]', ns).text

    # Build the name of the indication (episodicity, severity, condition)
    indication_id = restriction.find('pbs:indication-reference', ns).get(xlink_href).lstrip('#')
    indication = texts_by_tag_and_id[(pbs_tag + 'indication', indication_id)]

    indication_strings = []
    # Episodicity and severity are optional. The reference is compared with None explicitly:
    # an Element without child elements is falsy, so a plain truth test would skip a
    # reference element that is present but has no children.
    for kind in ('episodicity', 'severity'):
        reference = indication.find('pbs:' + kind + '-reference', ns)
        if reference is not None:
            indication_strings.append(_referenced_preferred_term(kind, reference))
    # The condition reference is required (a missing one raises AttributeError)
    indication_strings.append(_referenced_preferred_term('condition', indication.find('pbs:condition-reference', ns)))

    # Get operator and criteria: the first any/all/one-of child, if there is one
    operators = restriction.findall('pbs:any', ns) + restriction.findall('pbs:all', ns) + restriction.findall('pbs:one-of', ns)
    operator = None
    if operators:
        # Local tag name without the '{namespace}' prefix
        operator = operators[0].tag[operators[0].tag.index('}') + 1:]

        for criteria_reference in operators[0].findall('pbs:criteria-reference', ns):
            criteria_id = criteria_reference.get(xlink_href).lstrip('#')
            criteria_code = criteria_reference.find('pbs:code', ns).text

            cur.execute('INSERT INTO pbs_restriction_criteria (restriction_code, criteria_code) VALUES (?, ?)', (code, criteria_code))

            # Queue this criteria for parsing
            criteria_to_parse.add(criteria_id)

    cur.execute('INSERT INTO pbs_restriction (code, treatment_of, indication, criteria_operator) VALUES (?, ?, ?, ?)', (code, treatment_of, ' '.join(indication_strings), operator))
# Parse criteria.
# For every queued criteria this writes one pbs_criteria row and one pbs_criteria_parameter row
# per referenced parameter, then commits everything written through con.

# Fully-qualified attribute names, as ElementTree exposes namespaced attributes
xlink_href = '{http://www.w3.org/1999/xlink}href'
xml_id = '{http://www.w3.org/XML/1998/namespace}id'

# Index the direct children of the prescribing texts list by xml:id once, instead of linearly
# scanning the whole list for every lookup. setdefault keeps the first element for an id,
# which is the element the wildcard find() would have returned.
prescribing_texts = root.find('pbs:prescribing-texts-list', ns)
texts_by_id = {}
for text_element in ([] if prescribing_texts is None else prescribing_texts):
    texts_by_id.setdefault(text_element.get(xml_id), text_element)

for criteria_id in sorted(criteria_to_parse):
    # A reference to an element missing from the prescribing texts list raises KeyError
    criteria = texts_by_id[criteria_id]
    # The criteria type is the element's local tag name, without the '{namespace}' prefix
    criteria_type = criteria.tag[criteria.tag.index('}') + 1:]
    code = criteria.find('pbs:code', ns).text

    # Get operator and parameters: the first any/all/one-of child, if there is one
    operators = criteria.findall('pbs:any', ns) + criteria.findall('pbs:all', ns) + criteria.findall('pbs:one-of', ns)
    # Reset for each criteria so that one without an operator element is stored with NULL
    # rather than with the operator left over from the previous iteration
    operator = None
    if operators:
        operator = operators[0].tag[operators[0].tag.index('}') + 1:]

        for parameter_reference in operators[0].findall('pbs:parameter-reference', ns):
            parameter_id = parameter_reference.get(xlink_href).lstrip('#')
            parameter = texts_by_id[parameter_id]

            # One line per dbk:para directly inside each dbk:note of the parameter
            note_text = '\n'.join(p.text for n in parameter.findall('dbk:note', ns) for p in n.findall('dbk:para', ns))

            cur.execute('INSERT INTO pbs_criteria_parameter (criteria_code, text) VALUES (?, ?)', (code, note_text))

    cur.execute('INSERT INTO pbs_criteria (code, type, parameters_operator) VALUES (?, ?, ?)', (code, criteria_type, operator))

con.commit()