#!/usr/bin/env python3
# Copyright © 2023 Lee Yingtong Li (RunasSudo)
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU Affero General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU Affero General Public License for more details.
#
# You should have received a copy of the GNU Affero General Public License
# along with this program. If not, see <https://www.gnu.org/licenses/>.

"""Import the PBS schedule XML into a fresh SQLite database.

Reads the lexicographically last ``*-xml-V3.zip`` in ``data/``, recreates all
tables in ``database.db`` and fills them with the schedule items of selected
PBS programs together with the MPPs, MPs, TPPs, restrictions and criteria
those items reference.
"""

import os
import sqlite3
import zipfile
from xml.etree import ElementTree as ET

# Attribute names in ElementTree's {namespace}local notation
XLINK_HREF = '{http://www.w3.org/1999/xlink}href'
RDF_RESOURCE = '{http://www.w3.org/1999/02/22-rdf-syntax-ns#}resource'

# Maps the rdf:resource of a benefit-type element to the value stored in pbs_item.benefit_type
BENEFIT_TYPES = {
    'http://pbs.gov.au/benefit-type/unrestricted': 'unrestricted',
    'http://pbs.gov.au/benefit-type/restricted': 'restricted',
    'http://pbs.gov.au/benefit-type/streamlined': 'streamlined',
    'http://pbs.gov.au/benefit-type/authority-required': 'authority',
}

# (table name, column definitions), in creation order
SCHEMA = (
    ('meta', 'key TEXT PRIMARY KEY, value TEXT'),
    ('pbs_item', 'code TEXT PRIMARY KEY, mpp_code TEXT, maximum_prescribable_units INTEGER, number_repeats INTEGER, benefit_type TEXT, program TEXT'),
    ('pbs_mpp', 'code TEXT PRIMARY KEY, mp_code TEXT, preferred_term TEXT'),
    ('pbs_tpp', 'code TEXT PRIMARY KEY, mpp_code TEXT, brand_name TEXT'),
    ('pbs_mp', 'code TEXT PRIMARY KEY, preferred_term TEXT'),
    ('pbs_item_restriction', 'item_code TEXT, restriction_code INTEGER'),
    ('pbs_restriction', 'code INTEGER PRIMARY KEY, treatment_of INTEGER, indication TEXT, treatment_phase TEXT, criteria_operator TEXT, criteria_rendered TEXT'),
    ('pbs_restriction_criteria', 'restriction_code INTEGER, criteria_code INTEGER'),
    ('pbs_criteria', 'code INTEGER PRIMARY KEY, type TEXT, parameters_operator TEXT'),
    ('pbs_criteria_parameter', 'id INTEGER PRIMARY KEY AUTOINCREMENT, criteria_code INTEGER, text TEXT'),
)


def _ref_id(reference):
    """Return the xml:id a reference element points at (its xlink:href without the leading '#')."""
    return reference.get(XLINK_HREF).lstrip('#')


def _local_name(element):
    """Return the tag of *element* without its '{namespace}' prefix."""
    return element.tag[element.tag.index('}')+1:]


# Open database
con = sqlite3.connect('database.db')
cur = con.cursor()

# Init schema (every table is dropped and recreated, so each run starts from scratch)
for table, columns in SCHEMA:
    cur.execute(f'DROP TABLE IF EXISTS {table}')
    cur.execute(f'CREATE TABLE {table} ({columns})')

# Parse XML
# The lexicographically last matching file name is used
pbs_zip_file = sorted(f for f in os.listdir('data') if f.endswith('-xml-V3.zip'))[-1]
with zipfile.ZipFile('data/' + pbs_zip_file, 'r') as zipf:
    pbs_xml_file = next(f for f in zipf.namelist() if f.endswith('.xml'))
    with zipf.open(pbs_xml_file, 'r') as f:
        tree = ET.parse(f)

print('Parsed XML')

root = tree.getroot()
ns = {
    'pbs': 'http://schema.pbs.gov.au/',
    'xlink': 'http://www.w3.org/1999/xlink',
    'xml': 'http://www.w3.org/XML/1998/namespace',
    'rdf': 'http://www.w3.org/1999/02/22-rdf-syntax-ns#',
    'dbk': 'http://docbook.org/ns/docbook',
    'dct': 'http://purl.org/dc/terms/',
}

# Write meta
cur.execute('INSERT INTO meta (key, value) VALUES (?, ?)', ('pbs_date', root.find('pbs:info', ns).find('dct:valid', ns).text))

# -----------------------------------------
# Parse items from each desired PBS program

# Work queues of xml:id values, filled while walking the items and drained below
mpps_to_parse = set()
tpps_to_parse = set()
mps_to_parse = set()
restrictions_to_parse = set()
criteria_to_parse = set()


def parse_program(program_code):
    """Insert the items of one PBS program and queue what they reference.

    For every prescribing-rule of the program whose code is *program_code*
    that has a benefit type available to medical practitioners, a row is
    written to pbs_item and one row per restriction reference to
    pbs_item_restriction. The referenced MPP and restriction ids are added to
    mpps_to_parse and restrictions_to_parse.

    Raises StopIteration if no program has the given code, and AssertionError
    if an item has more than one medical-practitioner benefit type.
    """
    # Get program
    program = next(
        p for p in root.find('pbs:schedule', ns).findall('pbs:program', ns)
        if p.find('pbs:info', ns).find('pbs:code', ns).text == program_code
    )

    # Get schedule items (prescribing-rule) in schedule
    for item in program.findall('pbs:prescribing-rule', ns):
        code = item.find('pbs:code', ns).text

        # Only get benefits available to medical practitioners
        # NB: compare with None - an Element without children is falsy
        benefits = [
            b for b in item.find('pbs:benefit-types-list', ns).findall('pbs:benefit-type', ns)
            if b.find('pbs:member-of-list', ns).find('pbs:member-of[@rdf:resource="http://pbs.gov.au/prescriber/medical"]', ns) is not None
        ]
        if not benefits:
            continue
        assert len(benefits) == 1
        benefit = benefits[0]

        ready_prepared = item.find('pbs:ready-prepared', ns)
        mpp_reference = ready_prepared.find('pbs:mpp-reference', ns)
        mpp_id = _ref_id(mpp_reference)
        mpp_code = mpp_reference.find('pbs:code', ns).text
        max_units = ready_prepared.find('pbs:maximum-prescribable[@rdf:resource="http://pbs.gov.au/reference/unit-of-use"]', ns).find('pbs:value', ns).text
        max_repeats = ready_prepared.find('pbs:number-repeats', ns).find('pbs:value', ns).text
        benefit_type = BENEFIT_TYPES[benefit.get(RDF_RESOURCE)]

        cur.execute('INSERT INTO pbs_item (code, mpp_code, maximum_prescribable_units, number_repeats, benefit_type, program) VALUES (?, ?, ?, ?, ?, ?)', (code, mpp_code, max_units, max_repeats, benefit_type, program_code))

        # Get restrictions
        restrictions = benefit.find('pbs:restriction-references-list', ns)
        if restrictions is not None:
            for restriction_reference in restrictions.findall('pbs:restriction-reference', ns):
                restriction_id = _ref_id(restriction_reference)
                restriction_code = restriction_reference.find('pbs:code', ns).text

                cur.execute('INSERT INTO pbs_item_restriction (item_code, restriction_code) VALUES (?, ?)', (code, restriction_code))

                # Queue this restriction for parsing
                restrictions_to_parse.add(restriction_id)

        # Queue the MPP for parsing
        mpps_to_parse.add(mpp_id)


parse_program('GE')  # General Schedule
parse_program('R1')  # Repatriation PBS
parse_program('HB')  # Section 100 (Highly Specialised Drugs) - Public Hospitals

# ----------------
# Parse MPPs, etc.

# Containers that every lookup below searches
drugs_list = root.find('pbs:drugs-list', ns)
prescribing_texts = root.find('pbs:prescribing-texts-list', ns)

# Parse MPPs
for mpp_id in sorted(mpps_to_parse):
    mpp = drugs_list.find('pbs:mpp[@xml:id="' + mpp_id + '"]', ns)
    mpp_code = mpp.find('pbs:code', ns).text
    mpp_preferred_term = mpp.find('pbs:preferred-term', ns).text

    mp_id = _ref_id(mpp.find('pbs:drug-references-list', ns).find('pbs:mp-reference', ns))
    mp = drugs_list.find('pbs:mp[@xml:id="' + mp_id + '"]', ns)
    # Must look the MP up because (per the original author's note) the code in the reference is only SNOMED
    mp_code = mp.find('pbs:code[@rdf:resource="http://pbs.gov.au/Drug/MP"]', ns).text

    # -------------
    # Manual fixups

    # Metoprolol
    if 'METOPROLOL' in mpp_preferred_term:
        # Incorrect capitalisation
        mpp_preferred_term = mpp_preferred_term.replace('METOPROLOL SUCCINATE Tablet', 'metoprolol succinate')
        mpp_preferred_term = mpp_preferred_term.replace('METOPROLOL TARTRATE Tablet', 'metoprolol tartrate')

        # Idiosyncratic word order
        mpp_preferred_term = mpp_preferred_term.replace('(controlled release)', 'modified release tablet')
        mpp_preferred_term = mpp_preferred_term.replace('mg,', 'mg tablet,')

        # Classify as "metoprolol tartrate"
        if mp_code == '1187PBSC':
            # The original MP is deliberately not queued; 432PBSC is used instead
            mp_id = None
            mp_code = '432PBSC'

    # Magnesium
    if mpp_preferred_term.startswith('magnesium 37.4 mg'):
        # Specify form
        mpp_preferred_term = mpp_preferred_term.replace('magnesium 37.4 mg', 'magnesium (as aspartate) 37.4 mg')

    cur.execute('INSERT INTO pbs_mpp (code, mp_code, preferred_term) VALUES (?, ?, ?)', (mpp_code, mp_code, mpp_preferred_term))

    # Queue the MP for parsing
    if mp_id:
        mps_to_parse.add(mp_id)

    # Get TPPs
    for tpp_reference in mpp.find('pbs:drug-references-list', ns).findall('pbs:tpp-reference', ns):
        tpp_id = _ref_id(tpp_reference)

        # Queue the TPP for parsing
        tpps_to_parse.add((tpp_id, mpp_code))

# Parse MPs
for mp_id in sorted(mps_to_parse):
    mp = drugs_list.find('pbs:mp[@xml:id="' + mp_id + '"]', ns)
    mp_code = mp.find('pbs:code[@rdf:resource="http://pbs.gov.au/Drug/MP"]', ns).text  # Also there are SNOMED codes but they are inconsistent
    mp_preferred_term = mp.find('pbs:preferred-term[@rdf:resource="http://pbs.gov.au/clinical"]', ns).text

    if mp_code == '432PBSC':
        # Specified as all uppercase in PBS XML for some reason
        mp_preferred_term = 'metoprolol'
    if mp_code == '686PBSC':
        # Specify form
        mp_preferred_term = 'magnesium aspartate'

    cur.execute('INSERT INTO pbs_mp (code, preferred_term) VALUES (?, ?)', (mp_code, mp_preferred_term))

# Parse TPPs
for (tpp_id, mpp_code) in sorted(tpps_to_parse):
    tpp = drugs_list.find('pbs:tpp[@xml:id="' + tpp_id + '"]', ns)
    tpp_code = tpp.find('pbs:code', ns).text
    tpp_brand_name = tpp.find('pbs:brand-name', ns).find('pbs:value', ns).text

    cur.execute('INSERT INTO pbs_tpp (code, mpp_code, brand_name) VALUES (?, ?, ?)', (tpp_code, mpp_code, tpp_brand_name))

# Parse restrictions
for restriction_id in sorted(restrictions_to_parse):
    restriction = prescribing_texts.find('pbs:restriction[@xml:id="' + restriction_id + '"]', ns)
    code = restriction.find('pbs:code[@rdf:resource="http://pbs.gov.au/code/restriction"]', ns).text
    treatment_of = restriction.find('pbs:code[@rdf:resource="http://pbs.gov.au/code/treatment-of"]', ns).text

    # Get treatment phase if any
    treatment_phase = None
    treatment_phase_reference = restriction.find('pbs:treatment-phase-reference', ns)
    if treatment_phase_reference is not None:
        treatment_phase_id = _ref_id(treatment_phase_reference)
        treatment_phase_elem = prescribing_texts.find('pbs:treatment-phase[@xml:id="' + treatment_phase_id + '"]', ns)
        treatment_phase = treatment_phase_elem.find('pbs:preferred-term', ns).text

    # Build the name of the indication (episodicity, severity, condition)
    indication_id = _ref_id(restriction.find('pbs:indication-reference', ns))
    indication = prescribing_texts.find('pbs:indication[@xml:id="' + indication_id + '"]', ns)

    indication_strings = []
    for component in ('episodicity', 'severity', 'condition'):
        component_reference = indication.find('pbs:' + component + '-reference', ns)
        if component_reference is None:
            continue
        component_elem = prescribing_texts.find('pbs:' + component + '[@xml:id="' + _ref_id(component_reference) + '"]', ns)
        indication_strings.append(component_elem.find('pbs:preferred-term', ns).text.strip())

    if not indication_strings:
        # TODO: The original note here is truncated ("Might have a ..."); such
        # indications presumably carry their text elsewhere - to be confirmed
        indication_strings = ['Unknown indication']

    # Get operator and criteria
    operators = restriction.findall('pbs:any', ns) + restriction.findall('pbs:all', ns) + restriction.findall('pbs:one-of', ns)
    if operators:
        # Only the first operator element is used
        operator = _local_name(operators[0])

        for criteria_reference in operators[0].findall('pbs:criteria-reference', ns):
            criteria_id = _ref_id(criteria_reference)
            criteria_code = criteria_reference.find('pbs:code', ns).text

            cur.execute('INSERT INTO pbs_restriction_criteria (restriction_code, criteria_code) VALUES (?, ?)', (code, criteria_code))

            # Queue this criteria for parsing
            criteria_to_parse.add(criteria_id)
    else:
        operator = None

    cur.execute('INSERT INTO pbs_restriction (code, treatment_of, indication, treatment_phase, criteria_operator) VALUES (?, ?, ?, ?, ?)', (code, treatment_of, ' '.join(indication_strings), treatment_phase, operator))

# Parse criteria
for criteria_id in sorted(criteria_to_parse):
    # The element name varies, so match on xml:id only and record the tag as the type
    criteria = prescribing_texts.find('*[@xml:id="' + criteria_id + '"]', ns)
    criteria_type = _local_name(criteria)
    code = criteria.find('pbs:code', ns).text

    # Get operator and parameters
    operators = criteria.findall('pbs:any', ns) + criteria.findall('pbs:all', ns) + criteria.findall('pbs:one-of', ns)
    if operators:
        operator = _local_name(operators[0])

        for parameter_reference in operators[0].findall('pbs:parameter-reference', ns):
            parameter_id = _ref_id(parameter_reference)
            parameter = prescribing_texts.find('*[@xml:id="' + parameter_id + '"]', ns)
            # p.text is None for a para without leading text; store it as an empty line
            note_text = '\n'.join(p.text or '' for n in parameter.findall('dbk:note', ns) for p in n.findall('dbk:para', ns))

            cur.execute('INSERT INTO pbs_criteria_parameter (criteria_code, text) VALUES (?, ?)', (code, note_text))
    else:
        # Without this, the operator of the previously processed element would be stored
        operator = None

    cur.execute('INSERT INTO pbs_criteria (code, type, parameters_operator) VALUES (?, ?, ?)', (code, criteria_type, operator))

con.commit()