MedicineSearch/import_pbs_xml.py

282 lines
13 KiB
Python
Executable File

#!/usr/bin/env python3
# Copyright © 2023 Lee Yingtong Li (RunasSudo)
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU Affero General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU Affero General Public License for more details.
#
# You should have received a copy of the GNU Affero General Public License
# along with this program. If not, see <https://www.gnu.org/licenses/>.
import os
import sqlite3
import zipfile
from xml.etree import ElementTree as ET
# Open database
con = sqlite3.connect('database.db')
cur = con.cursor()

# Init schema
# Every table is dropped and recreated, so each run starts from empty tables.
# Entries are (table name, CREATE statement) in creation order.
_TABLE_DEFINITIONS = (
    ('meta', 'CREATE TABLE meta (key TEXT PRIMARY KEY, value TEXT)'),
    ('pbs_item', 'CREATE TABLE pbs_item (code TEXT PRIMARY KEY, mpp_code TEXT, maximum_prescribable_units INTEGER, number_repeats INTEGER, benefit_type TEXT, program TEXT)'),
    ('pbs_mpp', 'CREATE TABLE pbs_mpp (code TEXT PRIMARY KEY, mp_code TEXT, preferred_term TEXT)'),
    ('pbs_tpp', 'CREATE TABLE pbs_tpp (code TEXT PRIMARY KEY, mpp_code TEXT, brand_name TEXT)'),
    ('pbs_mp', 'CREATE TABLE pbs_mp (code TEXT PRIMARY KEY, preferred_term TEXT)'),
    ('pbs_item_restriction', 'CREATE TABLE pbs_item_restriction (item_code TEXT, restriction_code INTEGER)'),
    ('pbs_restriction', 'CREATE TABLE pbs_restriction (code INTEGER PRIMARY KEY, treatment_of INTEGER, indication TEXT, treatment_phase TEXT, criteria_operator TEXT, criteria_rendered TEXT)'),
    ('pbs_restriction_criteria', 'CREATE TABLE pbs_restriction_criteria (restriction_code INTEGER, criteria_code INTEGER)'),
    ('pbs_criteria', 'CREATE TABLE pbs_criteria (code INTEGER PRIMARY KEY, type TEXT, parameters_operator TEXT)'),
    ('pbs_criteria_parameter', 'CREATE TABLE pbs_criteria_parameter (id INTEGER PRIMARY KEY AUTOINCREMENT, criteria_code INTEGER, text TEXT)'),
)
for _table_name, _create_statement in _TABLE_DEFINITIONS:
    cur.execute('DROP TABLE IF EXISTS ' + _table_name)
    cur.execute(_create_statement)
# Parse XML
# Use the lexicographically last '*-xml-V3.zip' archive in the data directory
# (presumably the most recent release - confirm against the file naming scheme).
pbs_zip_candidates = sorted(f for f in os.listdir('data') if f.endswith('-xml-V3.zip'))
if not pbs_zip_candidates:
    # Fail with a clear message instead of an IndexError on an empty list
    raise FileNotFoundError("No '*-xml-V3.zip' archive found in the 'data' directory")
pbs_zip_file = pbs_zip_candidates[-1]

with zipfile.ZipFile(os.path.join('data', pbs_zip_file), 'r') as zipf:
    # The first archive member ending in '.xml' is taken as the PBS schedule
    pbs_xml_file = next((f for f in zipf.namelist() if f.endswith('.xml')), None)
    if pbs_xml_file is None:
        # Fail with a clear message instead of a bare StopIteration
        raise FileNotFoundError('No .xml member found in ' + pbs_zip_file)
    with zipf.open(pbs_xml_file, 'r') as f:
        tree = ET.parse(f)

print('Parsed XML')

root = tree.getroot()

# Prefix -> namespace URI map passed to every find()/findall() call
ns = {'pbs': 'http://schema.pbs.gov.au/', 'xlink': 'http://www.w3.org/1999/xlink', 'xml': 'http://www.w3.org/XML/1998/namespace', 'rdf': 'http://www.w3.org/1999/02/22-rdf-syntax-ns#', 'dbk': 'http://docbook.org/ns/docbook', 'dct': 'http://purl.org/dc/terms/'}

# Write meta
# Record the text of <info>/<valid> from the root element under the 'pbs_date' key
cur.execute('INSERT INTO meta (key, value) VALUES (?, ?)', ('pbs_date', root.find('pbs:info', ns).find('dct:valid', ns).text))
# -----------------------------------------
# Parse items from each desired PBS program

# Work queues of xml:id values still to be imported. Each is an independent set,
# so an entity referenced many times is only parsed once.
# tpps_to_parse holds (tpp_id, mpp_code) tuples; the others hold plain ids.
mpps_to_parse, tpps_to_parse, mps_to_parse = set(), set(), set()
restrictions_to_parse, criteria_to_parse = set(), set()
def parse_program(program_code):
    """Import the schedule items (prescribing rules) of one PBS program.

    Finds the <program> under <schedule> whose <info>/<code> text equals
    program_code. For every <prescribing-rule> in it that has a benefit type
    available to medical practitioners, one row is inserted into pbs_item and
    one row into pbs_item_restriction per referenced restriction. The ids of
    the referenced restrictions and of the item's MPP are added to the
    module-level restrictions_to_parse and mpps_to_parse sets.

    Raises:
        ValueError: if no program has the given code, or if an item has more
            than one benefit type available to medical practitioners.
        KeyError: if the benefit type URI is not one of the four known types.
    """
    xlink_href = '{http://www.w3.org/1999/xlink}href'
    rdf_resource = '{http://www.w3.org/1999/02/22-rdf-syntax-ns#}resource'
    medical_prescriber = 'pbs:member-of[@rdf:resource="http://pbs.gov.au/prescriber/medical"]'
    benefit_type_names = {
        'http://pbs.gov.au/benefit-type/unrestricted': 'unrestricted',
        'http://pbs.gov.au/benefit-type/restricted': 'restricted',
        'http://pbs.gov.au/benefit-type/streamlined': 'streamlined',
        'http://pbs.gov.au/benefit-type/authority-required': 'authority',
    }

    # Get program
    programs = root.find('pbs:schedule', ns).findall('pbs:program', ns)
    program = next((p for p in programs if p.find('pbs:info', ns).find('pbs:code', ns).text == program_code), None)
    if program is None:
        raise ValueError('PBS program not found: ' + repr(program_code))

    # Get schedule items (prescribing-rule) in schedule
    for item in program.findall('pbs:prescribing-rule', ns):
        code = item.find('pbs:code', ns).text

        # Only get benefits available to medical practitioners.
        # Compare against None explicitly: an Element without children is
        # falsy, so a plain truth test would drop a childless <member-of>.
        benefits = [
            b for b in item.find('pbs:benefit-types-list', ns).findall('pbs:benefit-type', ns)
            if b.find('pbs:member-of-list', ns).find(medical_prescriber, ns) is not None
        ]
        if not benefits:
            continue
        if len(benefits) != 1:
            # Raised explicitly (not asserted) so the check survives python -O
            raise ValueError('Item ' + repr(code) + ' has ' + str(len(benefits)) + ' benefit types for medical practitioners, expected 1')
        benefit = benefits[0]

        ready_prepared = item.find('pbs:ready-prepared', ns)
        mpp_reference = ready_prepared.find('pbs:mpp-reference', ns)
        mpp_id = mpp_reference.get(xlink_href).lstrip('#')
        mpp_code = mpp_reference.find('pbs:code', ns).text
        max_units = ready_prepared.find('pbs:maximum-prescribable[@rdf:resource="http://pbs.gov.au/reference/unit-of-use"]', ns).find('pbs:value', ns).text
        max_repeats = ready_prepared.find('pbs:number-repeats', ns).find('pbs:value', ns).text
        benefit_type = benefit_type_names[benefit.get(rdf_resource)]

        cur.execute('INSERT INTO pbs_item (code, mpp_code, maximum_prescribable_units, number_repeats, benefit_type, program) VALUES (?, ?, ?, ?, ?, ?)', (code, mpp_code, max_units, max_repeats, benefit_type, program_code))

        # Get restrictions
        restrictions = benefit.find('pbs:restriction-references-list', ns)
        if restrictions is not None:
            for restriction_reference in restrictions.findall('pbs:restriction-reference', ns):
                restriction_id = restriction_reference.get(xlink_href).lstrip('#')
                restriction_code = restriction_reference.find('pbs:code', ns).text
                cur.execute('INSERT INTO pbs_item_restriction (item_code, restriction_code) VALUES (?, ?)', (code, restriction_code))

                # Queue this restriction for parsing
                restrictions_to_parse.add(restriction_id)

        # Queue the MPP for parsing
        mpps_to_parse.add(mpp_id)
# Programs to import, in this order:
#   GE - General Schedule
#   R1 - Repatriation PBS
#   HB - Section 100 (Highly Specialised Drugs) - Public Hospitals
for _program_code in ('GE', 'R1', 'HB'):
    parse_program(_program_code)
# ----------------
# Parse MPPs, etc.

# Parse MPPs
# Each queued MPP produces one pbs_mpp row, queues the MP it refers to (unless a
# fixup below remaps it) and queues every TPP it references.
_drugs_list = root.find('pbs:drugs-list', ns)

# Metoprolol term corrections, applied in this order
_METOPROLOL_REPLACEMENTS = (
    # Incorrect capitalisation
    ('METOPROLOL SUCCINATE Tablet', 'metoprolol succinate'),
    ('METOPROLOL TARTRATE Tablet', 'metoprolol tartrate'),
    # Idiosyncratic word order
    ('(controlled release)', 'modified release tablet'),
    ('mg,', 'mg tablet,'),
)

for mpp_id in sorted(mpps_to_parse):
    mpp = _drugs_list.find('pbs:mpp[@xml:id="' + mpp_id + '"]', ns)
    mpp_code = mpp.find('pbs:code', ns).text
    mpp_preferred_term = mpp.find('pbs:preferred-term', ns).text

    drug_references = mpp.find('pbs:drug-references-list', ns)
    mp_id = drug_references.find('pbs:mp-reference', ns).get('{http://www.w3.org/1999/xlink}href').lstrip('#')
    mp = _drugs_list.find('pbs:mp[@xml:id="' + mp_id + '"]', ns)
    # Must look this up because the <code> in <mp-reference> is only SNOMED
    mp_code = mp.find('pbs:code[@rdf:resource="http://pbs.gov.au/Drug/MP"]', ns).text

    # -------------
    # Manual fixups

    # Metoprolol
    if 'METOPROLOL' in mpp_preferred_term:
        for old_text, new_text in _METOPROLOL_REPLACEMENTS:
            mpp_preferred_term = mpp_preferred_term.replace(old_text, new_text)

        # Classify as "metoprolol tartrate"
        if mp_code == '1187PBSC':
            # Clearing mp_id keeps the original MP from being queued below
            mp_id = None
            mp_code = '432PBSC'

    # Magnesium
    if mpp_preferred_term.startswith('magnesium 37.4 mg'):
        # Specify form
        mpp_preferred_term = mpp_preferred_term.replace('magnesium 37.4 mg', 'magnesium (as aspartate) 37.4 mg')

    cur.execute('INSERT INTO pbs_mpp (code, mp_code, preferred_term) VALUES (?, ?, ?)', (mpp_code, mp_code, mpp_preferred_term))

    # Queue the MP for parsing
    if mp_id:
        mps_to_parse.add(mp_id)

    # Get TPPs and queue each one, paired with the code of the MPP it belongs to
    tpps_to_parse.update(
        (tpp_reference.get('{http://www.w3.org/1999/xlink}href').lstrip('#'), mpp_code)
        for tpp_reference in drug_references.findall('pbs:tpp-reference', ns)
    )
# Parse MPs
# Preferred terms that replace the one read from the XML, keyed by MP code
_MP_TERM_OVERRIDES = {
    '432PBSC': 'metoprolol',  # Specified as all uppercase in PBS XML for some reason
    '686PBSC': 'magnesium aspartate',  # Specify form
}

for mp_id in sorted(mps_to_parse):
    mp = root.find('pbs:drugs-list', ns).find('pbs:mp[@xml:id="' + mp_id + '"]', ns)
    # Also there are SNOMED codes but they are inconsistent
    mp_code = mp.find('pbs:code[@rdf:resource="http://pbs.gov.au/Drug/MP"]', ns).text
    xml_term = mp.find('pbs:preferred-term[@rdf:resource="http://pbs.gov.au/clinical"]', ns).text
    mp_preferred_term = _MP_TERM_OVERRIDES.get(mp_code, xml_term)

    cur.execute('INSERT INTO pbs_mp (code, preferred_term) VALUES (?, ?)', (mp_code, mp_preferred_term))
# Parse TPPs
# Each queue entry pairs a TPP id with the code of the MPP that referenced it
_TPP_INSERT = 'INSERT INTO pbs_tpp (code, mpp_code, brand_name) VALUES (?, ?, ?)'

for tpp_id, mpp_code in sorted(tpps_to_parse):
    tpp = root.find('pbs:drugs-list', ns).find('pbs:tpp[@xml:id="' + tpp_id + '"]', ns)
    tpp_row = (
        tpp.find('pbs:code', ns).text,
        mpp_code,
        tpp.find('pbs:brand-name', ns).find('pbs:value', ns).text,
    )
    cur.execute(_TPP_INSERT, tpp_row)
# Parse restrictions
_prescribing_texts = root.find('pbs:prescribing-texts-list', ns)


def _find_prescribing_text(kind, reference):
    """Return the <kind> child of <prescribing-texts-list> that reference points to.

    reference is a *-reference element; its xlink:href (with the leading '#'
    removed) is matched against the xml:id of the target element. Returns
    None if no such element exists.
    """
    target_id = reference.get('{http://www.w3.org/1999/xlink}href').lstrip('#')
    return _prescribing_texts.find('pbs:' + kind + '[@xml:id="' + target_id + '"]', ns)


for restriction_id in sorted(restrictions_to_parse):
    restriction = _prescribing_texts.find('pbs:restriction[@xml:id="' + restriction_id + '"]', ns)
    code = restriction.find('pbs:code[@rdf:resource="http://pbs.gov.au/code/restriction"]', ns).text
    treatment_of = restriction.find('pbs:code[@rdf:resource="http://pbs.gov.au/code/treatment-of"]', ns).text

    # Get treatment phase if any.
    # References are compared against None explicitly here and below: an
    # Element without children is falsy, so a plain truth test would treat a
    # childless reference element as absent.
    treatment_phase = None
    treatment_phase_reference = restriction.find('pbs:treatment-phase-reference', ns)
    if treatment_phase_reference is not None:
        treatment_phase_elem = _find_prescribing_text('treatment-phase', treatment_phase_reference)
        treatment_phase = treatment_phase_elem.find('pbs:preferred-term', ns).text

    # Build the name of the indication (episodicity, severity, condition)
    indication = _find_prescribing_text('indication', restriction.find('pbs:indication-reference', ns))

    indication_strings = []
    for part in ('episodicity', 'severity', 'condition'):
        part_reference = indication.find('pbs:' + part + '-reference', ns)
        if part_reference is not None:
            part_elem = _find_prescribing_text(part, part_reference)
            indication_strings.append(part_elem.find('pbs:preferred-term', ns).text.strip())

    if not indication_strings:
        # TODO: Might have a <block-container>
        indication_strings = ['Unknown indication']

    # Get operator and criteria
    # Only the first any/all/one-of element (searched in that order) is used
    operators = restriction.findall('pbs:any', ns) + restriction.findall('pbs:all', ns) + restriction.findall('pbs:one-of', ns)
    if operators:
        # Local tag name, i.e. the tag without its '{namespace}' prefix
        operator = operators[0].tag[operators[0].tag.index('}')+1:]

        for criteria_reference in operators[0].findall('pbs:criteria-reference', ns):
            criteria_id = criteria_reference.get('{http://www.w3.org/1999/xlink}href').lstrip('#')
            criteria_code = criteria_reference.find('pbs:code', ns).text
            cur.execute('INSERT INTO pbs_restriction_criteria (restriction_code, criteria_code) VALUES (?, ?)', (code, criteria_code))

            # Queue this criteria for parsing
            criteria_to_parse.add(criteria_id)
    else:
        operator = None

    cur.execute('INSERT INTO pbs_restriction (code, treatment_of, indication, treatment_phase, criteria_operator) VALUES (?, ?, ?, ?, ?)', (code, treatment_of, ' '.join(indication_strings), treatment_phase, operator))
# Parse criteria
for criteria_id in sorted(criteria_to_parse):
    # The element type is not known in advance, so match any tag by xml:id
    criteria = root.find('pbs:prescribing-texts-list', ns).find('*[@xml:id="' + criteria_id + '"]', ns)
    # Local tag name, i.e. the tag without its '{namespace}' prefix
    criteria_type = criteria.tag[criteria.tag.index('}')+1:]
    code = criteria.find('pbs:code', ns).text

    # Get operator and parameters
    # Only the first any/all/one-of element (searched in that order) is used
    operators = criteria.findall('pbs:any', ns) + criteria.findall('pbs:all', ns) + criteria.findall('pbs:one-of', ns)
    if operators:
        operator = operators[0].tag[operators[0].tag.index('}')+1:]

        for parameter_reference in operators[0].findall('pbs:parameter-reference', ns):
            parameter_id = parameter_reference.get('{http://www.w3.org/1999/xlink}href').lstrip('#')
            parameter = root.find('pbs:prescribing-texts-list', ns).find('*[@xml:id="' + parameter_id + '"]', ns)
            # One line per <para> inside the parameter's <note> elements
            note_text = '\n'.join(p.text for n in parameter.findall('dbk:note', ns) for p in n.findall('dbk:para', ns))
            cur.execute('INSERT INTO pbs_criteria_parameter (criteria_code, text) VALUES (?, ?)', (code, note_text))
    else:
        # Reset explicitly: otherwise a criteria without an operator element
        # would be stored with the operator left over from an earlier iteration
        operator = None

    cur.execute('INSERT INTO pbs_criteria (code, type, parameters_operator) VALUES (?, ?, ?)', (code, criteria_type, operator))

# Commit all rows inserted through cur
con.commit()