chapter parser

  1
  2
  3
  4
  5
  6
  7
  8
  9
 10
 11
 12
 13
 14
 15
 16
 17
 18
 19
 20
 21
 22
 23
 24
 25
 26
 27
 28
 29
 30
 31
 32
 33
 34
 35
 36
 37
 38
 39
 40
 41
 42
 43
 44
 45
 46
 47
 48
 49
 50
 51
 52
 53
 54
 55
 56
 57
 58
 59
 60
 61
 62
 63
 64
 65
 66
 67
 68
 69
 70
 71
 72
 73
 74
 75
 76
 77
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
#!/usr/bin/python
# -*- coding: utf-8 -*-
import os
import sys
import xml.dom.minidom
from xml.dom.minidom import Node
from xml.dom.minidom import Document
# Converts every ``*.html`` chapter file in the current directory into a
# ``chapter_<NNN>.xml`` file holding a <resources> document of named <string>
# entries whose bodies are CDATA-wrapped HTML fragments.
#
# NOTE(review): the input is read with xml.dom.minidom, so each chapter file
# must be well-formed XML (XHTML); ordinary "tag soup" HTML will not parse.

xml_ext = '.xml'
html_ext = '.html'

# ``class`` attribute values of the <div> elements that carry passage content.
PASSAGE_ORIGINAL = 'passage_original'
PASSAGE_DETAILED = 'passage_detailed hidden_detailed'

COMMENT_COLOR = '#008000'
DICTIONARY_COLOR = '#2B788C'


def _green_font(doc):
    """Return a new, unattached <font> element using the comment color."""
    font = doc.createElement('font')
    font.setAttribute('color', COMMENT_COLOR)
    return font


def _append_inline(doc, p_tag, inline):
    """Render one inline element of a passage paragraph into ``p_tag``.

    Every text child of ``inline`` becomes its own green <font>.  Children of
    a nested element are each wrapped in an <i> that is attached to the most
    recent green <font> of this same ``inline`` element; nested elements with
    class ``dictionary`` get an additional colored <font> inside the <i>,
    class ``definition`` gets the plain text.
    """
    # Reset per inline element so nested markup can never be attached to a
    # <font> left over from an earlier span, paragraph or file (and never
    # hits an unbound name when the nested element comes first).
    font = None
    for part in inline.childNodes:
        if part.nodeType == Node.TEXT_NODE:
            font = _green_font(doc)
            font.appendChild(doc.createTextNode(part.data))
            p_tag.appendChild(font)
        elif part.nodeType == Node.ELEMENT_NODE:
            part_class = part.getAttribute('class')
            for leaf in part.childNodes:
                if font is None:
                    # Nested element appeared before any text: give it an
                    # (initially empty) green holder of its own.
                    font = _green_font(doc)
                    p_tag.appendChild(font)
                italic = doc.createElement('i')
                font.appendChild(italic)
                # NOTE(review): ``leaf.data`` assumes the nested element
                # holds only text-like nodes; an element child would raise
                # AttributeError here, exactly as in the original script.
                if part_class == 'dictionary':
                    dictionary_font = doc.createElement('font')
                    dictionary_font.setAttribute('color', DICTIONARY_COLOR)
                    dictionary_font.appendChild(doc.createTextNode(leaf.data))
                    italic.appendChild(dictionary_font)
                elif part_class == 'definition':
                    italic.appendChild(doc.createTextNode(leaf.data))


def _append_paragraph(doc, p_tag, p_node):
    """Copy the content of a non-phonetic <p> into the accumulated ``p_tag``."""
    for child in p_node.childNodes:
        if child.nodeType == Node.TEXT_NODE:
            p_tag.appendChild(doc.createTextNode(child.data))
        elif child.nodeType == Node.ELEMENT_NODE:
            _append_inline(doc, p_tag, child)


def _phonetic_string(doc, p_node, block_count):
    """Build the ``phonetic_NNN`` <string> for a ``class="phonetic"`` <p>.

    Only the paragraph's own text nodes and the text nodes one element level
    down are kept; all markup is dropped.
    """
    pieces = []
    for child in p_node.childNodes:
        if child.nodeType == Node.TEXT_NODE:
            pieces.append(child.data)
        elif child.nodeType == Node.ELEMENT_NODE:
            for grandchild in child.childNodes:
                if grandchild.nodeType == Node.TEXT_NODE:
                    pieces.append(grandchild.data)

    plain = doc.createElement('p')
    plain.appendChild(doc.createTextNode("".join(pieces)))

    entry = doc.createElement('string')
    entry.setAttribute('name', 'phonetic_' + '%03d' % (block_count))
    entry.appendChild(doc.createCDATASection(plain.toxml()))
    return entry


def _append_headings(doc, string_h1, div_node):
    """Append one CDATA-wrapped <h1> to ``string_h1`` per heading child node.

    NOTE(review): the original creates a fresh <h1> for every child node of
    a heading rather than one per heading; that is preserved here.  Confirm
    whether a single <h1> per heading was intended.
    """
    for h1_node in div_node.getElementsByTagName('h1'):
        for child in h1_node.childNodes:
            heading = doc.createElement('h1')
            if child.nodeType == Node.TEXT_NODE:
                heading.appendChild(doc.createTextNode(child.data))
            elif child.nodeType == Node.ELEMENT_NODE:
                for grandchild in child.childNodes:
                    if grandchild.nodeType == Node.TEXT_NODE:
                        font = _green_font(doc)
                        font.appendChild(doc.createTextNode(grandchild.data))
                        heading.appendChild(font)
            string_h1.appendChild(doc.createCDATASection(heading.toxml()))


def build_resources(chapter):
    """Convert a parsed chapter DOM into the output <resources> Document.

    <div> elements are visited in document order.  A <div> whose class is
    not one of the two passage classes only advances the block counter
    (starting at -1).  A passage <div> produces:

    * ``original_NNN`` / ``detailed_NNN`` with all of its non-phonetic
      paragraphs merged into a single <p> (separated by <br/>), when the
      counter is not 0;
    * ``title_original_001`` / ``title_detailed_001`` holding its <h1>
      content, when the counter is 0 (the merged paragraph text of such a
      block is built but not attached to the output);
    * one ``phonetic_NNN`` entry per ``class="phonetic"`` paragraph.

    The caller owns the returned Document and should ``unlink()`` it.
    """
    doc = Document()
    root = doc.createElement('resources')
    doc.appendChild(root)

    block_count = -1
    for div_node in chapter.getElementsByTagName('div'):
        div_class = div_node.getAttribute('class')
        if div_class != PASSAGE_ORIGINAL and div_class != PASSAGE_DETAILED:
            block_count = block_count + 1
            continue

        kind = 'original' if div_class == PASSAGE_ORIGINAL else 'detailed'
        string = doc.createElement('string')
        string_h1 = doc.createElement('string')
        # Only one of the two entries is attached to the output; the other
        # is still filled in below but is simply never serialized.
        if block_count != 0:
            string.setAttribute('name', kind + '_' + '%03d' % (block_count))
            root.appendChild(string)
        else:
            string_h1.setAttribute('name', 'title_' + kind + '_001')
            root.appendChild(string_h1)

        paragraphs = div_node.getElementsByTagName('p')
        p_tag = doc.createElement('p')
        for p_node in paragraphs:
            if p_node.getAttribute('class') == 'phonetic':
                root.appendChild(_phonetic_string(doc, p_node, block_count))
                continue
            # Separate every paragraph except the very first <p> of the div.
            if p_node is not paragraphs[0]:
                p_tag.appendChild(doc.createElement('br'))
            _append_paragraph(doc, p_tag, p_node)

        _append_headings(doc, string_h1, div_node)
        string.appendChild(doc.createCDATASection(p_tag.toxml()))
    return doc


def convert_chapter(html_name):
    """Parse ``html_name`` and write ``chapter_<last 3 chars of stem>.xml``."""
    # splitext keeps multi-dot names intact ("a.b.html" -> "a.b"), so the
    # file that was listed is the file that gets parsed.
    stem = os.path.splitext(html_name)[0]
    file_count = stem[-3:]
    print(stem)

    chapter = xml.dom.minidom.parse(html_name)
    doc = build_resources(chapter)
    try:
        # The context manager closes (and flushes) the output file even if
        # serialization fails.
        # NOTE(review): the XML declaration says utf-8 but the file is opened
        # with the platform default text encoding, as in the original.
        with open('chapter_' + file_count + xml_ext, 'w') as out:
            doc.writexml(out, encoding='utf-8', newl='\n')
    finally:
        doc.unlink()


files = os.listdir(".")
htmls = [name for name in files if name.endswith(html_ext)]
for i in htmls:
    convert_chapter(i)