1 |
dpavlin |
325 |
#!/usr/bin/perl -w |
2 |
|
|
|
3 |
|
|
=head1 NAME |
4 |
|
|
|
5 |
|
|
mods2marc.pl - convert MODS XML back to MARC (ISO2709) |
6 |
|
|
|
7 |
|
|
=head1 SYNOPSIS |
8 |
|
|
|
9 |
dpavlin |
328 |
mods2marc.pl export.marc mods.xml [mods2.xml ... ] |
10 |
dpavlin |
325 |
|
11 |
|
|
=head1 DESCRIPTION |
12 |
|
|
|
13 |
|
|
This script will convert MODS format |
14 |
|
|
L<http://www.loc.gov/standards/mods/> |
15 |
|
|
back to MARC (ISO2709) format. |
16 |
|
|
|
17 |
|
|
Since conversion back to MARC is not simple, lot of things are hard-coded |
18 |
|
|
in this script. |
19 |
|
|
|
20 |
|
|
This script B<is somewhat specific> to MODS export from |
21 |
|
|
Faculty of Electrical Engineering and Computing |
22 |
dpavlin |
326 |
so you might want to edit it (among other thing, it includes a lot |
23 |
|
|
of fields which are in Croatian). |
24 |
dpavlin |
325 |
|
25 |
dpavlin |
326 |
Feel free to hack this script and convert it to your own needs. |
26 |
|
|
|
27 |
dpavlin |
327 |
=head1 CAVEAT |
28 |
dpavlin |
325 |
|
29 |
dpavlin |
327 |
This script will parse imput XML twice: once with C<XML::Twig> and |
30 |
|
|
then each entry with C<XML::Simple> to produce in-memory structure. |
31 |
|
|
That's because I wanted to keep node selection logical (and perl-like). |
32 |
dpavlin |
325 |
|
33 |
dpavlin |
327 |
If you don't like it, you can rewrite this script to use XPATH. I tried |
34 |
|
|
and failed (it seems that MODS is too complicated for my limited knowledge |
35 |
|
|
of XPATH). |
36 |
|
|
|
37 |
dpavlin |
325 |
=cut |
38 |
|
|
|
39 |
|
|
use strict; |
40 |
|
|
use XML::Twig; |
41 |
|
|
use XML::Simple; |
42 |
|
|
use MARC; |
43 |
|
|
use Text::Iconv; |
44 |
|
|
|
45 |
|
|
use Data::Dumper; |
46 |
|
|
|
47 |
dpavlin |
328 |
my $marc_file = shift @ARGV || die "$0: need MARC export file"; |
48 |
|
|
die "$0: need at least one MODS XML file" if (! @ARGV); |
49 |
dpavlin |
325 |
|
50 |
|
|
$|=1; |
51 |
|
|
my $nr = 0; |
52 |
|
|
|
53 |
|
|
my $marc = MARC->new; |
54 |
|
|
|
55 |
dpavlin |
327 |
my $ENCODING = 'ISO-8859-2'; |
56 |
|
|
|
57 |
dpavlin |
325 |
my $twig=XML::Twig->new( |
58 |
dpavlin |
327 |
twig_roots => { 'mods' => \&mods }, |
59 |
|
|
output_encoding => 'UTF8', |
60 |
dpavlin |
325 |
); |
61 |
|
|
|
62 |
dpavlin |
327 |
my $utf2iso = Text::Iconv->new("UTF8", $ENCODING); |
63 |
dpavlin |
325 |
|
64 |
dpavlin |
328 |
foreach my $xml_file (@ARGV) { |
65 |
|
|
print "$xml_file: "; |
66 |
|
|
$twig->parsefile($xml_file); |
67 |
|
|
$twig->purge; |
68 |
|
|
print "$nr\n"; |
69 |
|
|
} |
70 |
dpavlin |
325 |
|
71 |
dpavlin |
328 |
print "Saving MARC file...\n"; |
72 |
|
|
|
73 |
dpavlin |
325 |
$marc->output({file=>"> $marc_file",'format'=>"usmarc"}); |
74 |
|
|
|
75 |
dpavlin |
327 |
sub mods { |
76 |
dpavlin |
325 |
my( $t, $elt)= @_; |
77 |
|
|
|
78 |
|
|
my $xml=$elt->xml_string; |
79 |
dpavlin |
327 |
my $ref = XMLin('<xml>'.$xml.'</xml>', |
80 |
dpavlin |
325 |
ForceArray => [ |
81 |
|
|
'name', |
82 |
|
|
'classification', |
83 |
|
|
'topic', |
84 |
dpavlin |
328 |
'relatedItem', |
85 |
|
|
'partNumber', |
86 |
dpavlin |
325 |
], |
87 |
|
|
KeyAttr => { |
88 |
|
|
'namePart' => 'type', |
89 |
|
|
'identifier' => 'type', |
90 |
|
|
'namePart' => 'type', |
91 |
|
|
'role' => 'type', |
92 |
|
|
}, |
93 |
|
|
GroupTags => { |
94 |
|
|
'place' => 'placeTerm', |
95 |
|
|
'physicalDescription' => 'extent', |
96 |
|
|
'roleTerm' => 'content', |
97 |
|
|
}, |
98 |
|
|
ContentKey => '-content', |
99 |
|
|
); |
100 |
|
|
|
101 |
dpavlin |
326 |
my $m_cache; |
102 |
dpavlin |
325 |
|
103 |
dpavlin |
326 |
sub marc_add { |
104 |
dpavlin |
327 |
my $m_cache = \shift || die "need m_cache"; |
105 |
dpavlin |
326 |
my $fld = shift || die "need field!"; |
106 |
dpavlin |
327 |
my $sf = shift; |
107 |
|
|
my $data = shift || return; |
108 |
dpavlin |
325 |
|
109 |
dpavlin |
327 |
#print "add: $fld",($sf ? "^".$sf : ''),": $data\n"; |
110 |
dpavlin |
325 |
|
111 |
dpavlin |
327 |
if ($sf) { |
112 |
|
|
push @{$$m_cache->{tmp}->{$fld}}, $sf; |
113 |
dpavlin |
325 |
} |
114 |
dpavlin |
327 |
push @{$$m_cache->{tmp}->{$fld}}, $utf2iso->convert($data); |
115 |
|
|
} |
116 |
dpavlin |
325 |
|
117 |
dpavlin |
327 |
sub marc_rep { |
118 |
|
|
my $m_cache = \shift || die "need m_cache"; |
119 |
|
|
foreach my $fld (@_) { |
120 |
|
|
#print "marc_rep: $fld\n"; |
121 |
|
|
push @{$$m_cache->{array}->{$fld}}, [ @{$$m_cache->{tmp}->{$fld}} ] if ($$m_cache->{tmp}->{$fld}); |
122 |
|
|
delete $$m_cache->{tmp}->{$fld}; |
123 |
|
|
} |
124 |
|
|
} |
125 |
dpavlin |
325 |
|
126 |
dpavlin |
327 |
sub marc_single { |
127 |
|
|
my $m_cache = \shift || die "need m_cache"; |
128 |
|
|
foreach my $fld (@_) { |
129 |
|
|
#print "marc_single: $fld\n"; |
130 |
dpavlin |
325 |
|
131 |
dpavlin |
327 |
die "$fld already defined! not single?" if ($$m_cache->{single}->{$fld}); |
132 |
dpavlin |
326 |
|
133 |
dpavlin |
327 |
$$m_cache->{single}->{$fld} = \@{$$m_cache->{tmp}->{$fld}} if ($$m_cache->{tmp}->{$fld}); |
134 |
|
|
delete $$m_cache->{tmp}->{$fld}; |
135 |
|
|
} |
136 |
dpavlin |
325 |
} |
137 |
|
|
|
138 |
dpavlin |
327 |
sub marc_add_rep { |
139 |
|
|
my $m_cache = \shift || die "need m_cache"; |
140 |
|
|
my $fld = shift || die "need field!"; |
141 |
|
|
my $sf = shift; |
142 |
|
|
my $data = shift || return; |
143 |
|
|
|
144 |
|
|
marc_add($$m_cache,$fld,$sf,$data); |
145 |
|
|
marc_rep($$m_cache,$fld); |
146 |
|
|
} |
147 |
|
|
|
148 |
|
|
sub marc_add_single { |
149 |
|
|
my $m_cache = \shift || die "need m_cache"; |
150 |
|
|
my $fld = shift || die "need field!"; |
151 |
|
|
my $sf = shift; |
152 |
|
|
my $data = shift || return; |
153 |
|
|
|
154 |
|
|
marc_add($$m_cache,$fld,$sf,$data); |
155 |
|
|
marc_single($$m_cache,$fld); |
156 |
|
|
} |
157 |
|
|
|
158 |
dpavlin |
326 |
my $journal = 0; |
159 |
dpavlin |
327 |
# Journals start with c- in our MODS |
160 |
dpavlin |
326 |
$journal = 1 if ($ref->{recordInfo}->{recordIdentifier} =~ m/^c-/); |
161 |
dpavlin |
325 |
|
162 |
dpavlin |
327 |
foreach my $t (@{$ref->{subject}->{topic}}) { |
163 |
|
|
marc_add($m_cache,'610','a', $t); |
164 |
|
|
marc_rep($m_cache,'610'); |
165 |
|
|
} |
166 |
dpavlin |
325 |
|
167 |
dpavlin |
327 |
my $fld_700 = '700'; |
168 |
|
|
my $fld_710 = '710'; |
169 |
|
|
|
170 |
dpavlin |
325 |
foreach my $name (@{$ref->{name}}) { |
171 |
|
|
my $role = $name->{role}->{roleTerm}->{content}; |
172 |
|
|
next if (! $role); |
173 |
|
|
if ($role eq "author") { |
174 |
dpavlin |
327 |
marc_add($m_cache,$fld_700,'a',$name->{namePart}->{family}); |
175 |
|
|
marc_add($m_cache,$fld_700,'b',$name->{namePart}->{given}); |
176 |
|
|
marc_add($m_cache,$fld_700,'4',$role); |
177 |
dpavlin |
325 |
|
178 |
dpavlin |
327 |
marc_rep($m_cache,$fld_700); |
179 |
|
|
|
180 |
dpavlin |
325 |
# first author goes in 700, others in 701 |
181 |
dpavlin |
327 |
$fld_700 = '701'; |
182 |
dpavlin |
325 |
} elsif ($role eq "editor" or $role eq "illustrator") { |
183 |
dpavlin |
326 |
marc_add($m_cache,'702','a',$name->{namePart}->{family}); |
184 |
|
|
marc_add($m_cache,'702','b',$name->{namePart}->{given}); |
185 |
|
|
marc_add($m_cache,'702','4',$role); |
186 |
dpavlin |
327 |
marc_rep($m_cache,'702'); |
187 |
|
|
} elsif ($role eq "corporate") { |
188 |
|
|
marc_add_single($m_cache,"$fld_710\t0 ",'a',$name->{namePart}); |
189 |
|
|
$fld_710 = '711'; |
190 |
|
|
} elsif ($role eq "conference") { |
191 |
|
|
marc_add_single($m_cache,"$fld_710\t1 ",'a',$name->{namePart}); |
192 |
|
|
$fld_710 = '711'; |
193 |
dpavlin |
325 |
} else { |
194 |
|
|
die "FATAL: don't know how to map role '$role'" if ($role); |
195 |
|
|
} |
196 |
|
|
} |
197 |
|
|
|
198 |
|
|
my $note = $ref->{note}; |
199 |
|
|
|
200 |
|
|
if ($note) { |
201 |
|
|
foreach my $n (split(/\s*;\s+/, $note)) { |
202 |
|
|
if ($n =~ s/bibliogr:\s+//i) { |
203 |
dpavlin |
327 |
marc_add_rep($m_cache,'320','a',"Bibliografija: $n"); |
204 |
dpavlin |
325 |
} elsif ($n =~ s/ilustr:\s+//i) { |
205 |
dpavlin |
326 |
marc_add($m_cache,'215','c', $n); |
206 |
dpavlin |
325 |
} else { |
207 |
dpavlin |
327 |
marc_add_rep($m_cache,'320','a',$n); |
208 |
dpavlin |
325 |
} |
209 |
|
|
} |
210 |
|
|
} |
211 |
|
|
|
212 |
dpavlin |
327 |
|
213 |
dpavlin |
325 |
my $type = $ref->{identifier}->{type}; |
214 |
|
|
|
215 |
|
|
if ($type) { |
216 |
|
|
if ($type eq "isbn") { |
217 |
dpavlin |
327 |
marc_add_rep($m_cache,'010','a',$ref->{identifier}->{content}); |
218 |
dpavlin |
325 |
} elsif ($type eq "issn") { |
219 |
dpavlin |
327 |
marc_add_rep($m_cache,'011','a',$ref->{identifier}->{content}); |
220 |
dpavlin |
328 |
} elsif ($type eq "uri") { |
221 |
|
|
marc_add_rep($m_cache,'856','u',$ref->{identifier}->{content}); |
222 |
dpavlin |
325 |
} else { |
223 |
|
|
die "unknown identifier type $type"; |
224 |
|
|
} |
225 |
|
|
} |
226 |
|
|
|
227 |
|
|
my $phy_desc = $ref->{physicalDescription}; |
228 |
|
|
if ($phy_desc) { |
229 |
|
|
my $tmp; |
230 |
|
|
foreach my $t (split(/\s*;\s+/, $phy_desc)) { |
231 |
|
|
if ($t =~ m/([^:]+):\s+(.+)$/) { |
232 |
|
|
$tmp->{$1} = $2; |
233 |
|
|
} else { |
234 |
dpavlin |
329 |
print STDERR "can't parse '$t' in ",Dumper($phy_desc); |
235 |
dpavlin |
325 |
} |
236 |
|
|
} |
237 |
|
|
my $data = $tmp->{pagin}; |
238 |
|
|
$data .= ", " if ($data); |
239 |
|
|
if ($tmp->{str}) { |
240 |
|
|
$data .= $tmp->{str}." str"; |
241 |
|
|
} |
242 |
dpavlin |
326 |
marc_add($m_cache,'215','a', $data) if ($data); |
243 |
|
|
marc_add($m_cache,'215','d', $tmp->{visina}); |
244 |
dpavlin |
325 |
} |
245 |
dpavlin |
327 |
marc_rep($m_cache,'215'); |
246 |
dpavlin |
325 |
|
247 |
dpavlin |
327 |
marc_add_single($m_cache,'001',undef,$ref->{recordInfo}->{recordIdentifier}); |
248 |
dpavlin |
325 |
|
249 |
dpavlin |
326 |
marc_add($m_cache,'200','a',$ref->{titleInfo}->{title}); |
250 |
|
|
marc_add($m_cache,'200','e',$ref->{titleInfo}->{subTitle}); |
251 |
dpavlin |
327 |
marc_single($m_cache,'200'); |
252 |
dpavlin |
325 |
|
253 |
dpavlin |
327 |
foreach my $c (@{$ref->{classification}}) { |
254 |
|
|
if ($c->{'authority'} eq "udc") { |
255 |
|
|
marc_add_rep($m_cache,'675','a', $c->{'content'}); |
256 |
|
|
} |
257 |
|
|
} |
258 |
dpavlin |
325 |
|
259 |
dpavlin |
328 |
foreach my $ri (@{$ref->{relatedItem}}) { |
260 |
|
|
my $related = $ri->{type}; |
261 |
|
|
if ($related) { |
262 |
|
|
if ($related eq "series") { |
263 |
|
|
marc_add_rep($m_cache,'225','a',$ri->{titleInfo}->{title}); |
264 |
|
|
foreach my $pn (@{$ri->{titleInfo}->{partNumber}}) { |
265 |
dpavlin |
330 |
if ($journal) { |
266 |
|
|
marc_add_rep($m_cache,'999','a',$pn); |
267 |
|
|
} else { |
268 |
|
|
marc_add_rep($m_cache,'225','v',$pn); |
269 |
|
|
} |
270 |
dpavlin |
328 |
} |
271 |
|
|
} elsif ($related eq "preceding") { |
272 |
dpavlin |
330 |
marc_add($m_cache,'520','a',$ri->{titleInfo}->{title}); |
273 |
|
|
if ($ri->{identifier}) { |
274 |
|
|
if ($ri->{identifier}->{type} eq "issn") { |
275 |
|
|
marc_add($m_cache,'520','x',$ri->{identifier}->{content}); |
276 |
|
|
} else { |
277 |
|
|
die "can't store identifier type $type"; |
278 |
|
|
} |
279 |
|
|
} |
280 |
|
|
marc_rep($m_cache,'520'); |
281 |
dpavlin |
328 |
} else { |
282 |
|
|
die "can't parse related item type $related" if ($related); |
283 |
|
|
} |
284 |
dpavlin |
325 |
} |
285 |
|
|
} |
286 |
|
|
|
287 |
dpavlin |
327 |
marc_add_single($m_cache,'205','a',$ref->{originInfo}->{edition}); |
288 |
dpavlin |
325 |
|
289 |
dpavlin |
329 |
marc_add($m_cache,'210','a',$ref->{originInfo}->{place}); |
290 |
|
|
|
291 |
dpavlin |
325 |
my $publisher = $ref->{originInfo}->{publisher}; |
292 |
|
|
if ($publisher =~ m,^(.+?)\s*/\s*(.+)$,) { |
293 |
dpavlin |
326 |
marc_add($m_cache,'210','a', $2); |
294 |
|
|
marc_add($m_cache,'210','c', $1); |
295 |
dpavlin |
325 |
} else { |
296 |
dpavlin |
326 |
marc_add($m_cache,'210','c', $publisher); |
297 |
dpavlin |
325 |
} |
298 |
|
|
|
299 |
dpavlin |
326 |
marc_add($m_cache,'210','d',$ref->{originInfo}->{dateIssued}); |
300 |
|
|
|
301 |
dpavlin |
327 |
marc_single($m_cache,'210'); |
302 |
|
|
|
303 |
|
|
marc_add_single($m_cache,'326','a',$ref->{originInfo}->{frequency}) if ($journal); |
304 |
|
|
|
305 |
dpavlin |
325 |
$nr++; |
306 |
|
|
print "$nr " if ($nr % 100 == 0); |
307 |
|
|
|
308 |
dpavlin |
326 |
# dump record |
309 |
dpavlin |
328 |
my $bib_level = "m"; |
310 |
|
|
$bib_level = "s" if ($journal); |
311 |
|
|
my $m=$marc->createrecord({leader=>"00000na".$bib_level." 2200000 a 4500"}); |
312 |
dpavlin |
327 |
|
313 |
|
|
foreach my $fld (keys %{$m_cache->{array}}) { |
314 |
|
|
foreach my $arr (@{$m_cache->{array}->{$fld}}) { |
315 |
|
|
#print "array = ",Dumper($arr); |
316 |
|
|
my ($i1,$i2); |
317 |
|
|
# do we have indicators? |
318 |
|
|
if ($fld =~ m/^(.+)\t(.)(.)$/) { |
319 |
|
|
$fld = $1; |
320 |
|
|
($i1,$i2) = ($2,$3); |
321 |
|
|
} |
322 |
|
|
$marc->addfield({record=>$m, |
323 |
|
|
field=>$fld, |
324 |
|
|
i1=>$i1, |
325 |
|
|
i2=>$i2, |
326 |
|
|
value=>$arr |
327 |
|
|
}); |
328 |
|
|
} |
329 |
|
|
} |
330 |
|
|
|
331 |
|
|
foreach my $fld (keys %{$m_cache->{single}}) { |
332 |
|
|
#print "single = ",Dumper($m_cache->{single}->{$fld}); |
333 |
|
|
my ($i1,$i2); |
334 |
|
|
# do we have indicators? |
335 |
|
|
if ($fld =~ m/^(.+)\t(.)(.)$/) { |
336 |
|
|
$fld = $1; |
337 |
|
|
($i1,$i2) = ($2,$3); |
338 |
|
|
} |
339 |
dpavlin |
326 |
$marc->addfield({record=>$m, |
340 |
|
|
field=>$fld, |
341 |
dpavlin |
327 |
i1=>$i1, |
342 |
|
|
i2=>$i2, |
343 |
|
|
value=>$m_cache->{single}->{$fld} |
344 |
dpavlin |
326 |
}); |
345 |
|
|
} |
346 |
|
|
|
347 |
dpavlin |
327 |
$m_cache = {}; |
348 |
|
|
|
349 |
dpavlin |
325 |
$t->purge; # frees the memory |
350 |
|
|
} |
351 |
|
|
|