1 |
#!/usr/bin/perl -w |
2 |
|
3 |
=head1 NAME |
4 |
|
5 |
mods2marc.pl - convert MODS XML back to MARC (ISO2709) |
6 |
|
7 |
=head1 SYNOPSIS |
8 |
|
9 |
mods2marc.pl mods.xml export.marc |
10 |
|
11 |
=head1 DESCRIPTION |
12 |
|
13 |
This script will convert MODS format |
14 |
L<http://www.loc.gov/standards/mods/> |
15 |
back to MARC (ISO2709) format. |
16 |
|
17 |
Since conversion back to MARC is not simple, lot of things are hard-coded |
18 |
in this script. |
19 |
|
20 |
This script B<is somewhat specific> to MODS export from |
21 |
Faculty of Electrical Engineering and Computing |
22 |
so you might want to edit it |
23 |
|
24 |
=head1 WARNING |
25 |
|
26 |
This script is in state of flux. |
27 |
|
28 |
=cut |
29 |
|
30 |
use strict; |
31 |
use XML::Twig; |
32 |
use XML::Simple; |
33 |
use MARC; |
34 |
use Text::Iconv; |
35 |
|
36 |
use Data::Dumper; |
37 |
|
38 |
my $xml_file = "/data/tehnika/fer/all.xml"; |
39 |
$xml_file = "/data/tehnika/fer/modsFER_1.xml"; |
40 |
my $marc_file = "fer.marc"; |
41 |
|
42 |
$|=1; |
43 |
my $nr = 0; |
44 |
|
45 |
my $marc = MARC->new; |
46 |
|
47 |
my $twig=XML::Twig->new( |
48 |
twig_roots => { 'mods' => \&item }, |
49 |
output_encoding => 'iso-8859-2', |
50 |
); |
51 |
|
52 |
my $utf2iso = Text::Iconv->new("UTF-8", "ISO-8859-2"); |
53 |
|
54 |
$twig->parsefile($xml_file); |
55 |
$twig->purge; |
56 |
|
57 |
$marc->output({file=>"> $marc_file",'format'=>"usmarc"}); |
58 |
|
59 |
sub item { |
60 |
my( $t, $elt)= @_; |
61 |
|
62 |
my $xml=$elt->xml_string; |
63 |
my $ref = XMLin("<xml>".$xml."</xml>", |
64 |
ForceArray => [ |
65 |
'name', |
66 |
'classification', |
67 |
'topic', |
68 |
'udc', |
69 |
], |
70 |
KeyAttr => { |
71 |
'namePart' => 'type', |
72 |
'identifier' => 'type', |
73 |
'classification' => 'authority', |
74 |
'namePart' => 'type', |
75 |
'role' => 'type', |
76 |
}, |
77 |
GroupTags => { |
78 |
'place' => 'placeTerm', |
79 |
'physicalDescription' => 'extent', |
80 |
'roleTerm' => 'content', |
81 |
}, |
82 |
ContentKey => '-content', |
83 |
); |
84 |
|
85 |
my $m=$marc->createrecord(); |
86 |
|
87 |
sub marc_arr { |
88 |
my $m = shift || die "no marc record?"; |
89 |
my $fld = shift || die "no marc field?"; |
90 |
my $sf = shift || ''; |
91 |
|
92 |
return if (! @_); |
93 |
|
94 |
my @a; |
95 |
foreach (@_) { |
96 |
next if (! $_); |
97 |
push @a,$sf; |
98 |
# push @a,$utf2iso->convert($_) || $_; |
99 |
push @a,$_; |
100 |
} |
101 |
|
102 |
return if (! @a); |
103 |
|
104 |
# print "storing $fld: ",join("|",@a),"\n"; |
105 |
|
106 |
$marc->addfield({record=>$m, |
107 |
field=>$fld, |
108 |
# i1=>$i1, |
109 |
# i2=>$i2, |
110 |
value=>\@a}); |
111 |
} |
112 |
|
113 |
marc_arr($m,'610','a',@{$ref->{subject}->{topic}}); |
114 |
|
115 |
my $fld = '700'; |
116 |
|
117 |
foreach my $name (@{$ref->{name}}) { |
118 |
my $role = $name->{role}->{roleTerm}->{content}; |
119 |
next if (! $role); |
120 |
if ($role eq "author") { |
121 |
marc_arr($m,$fld,'a',$name->{namePart}->{family}); |
122 |
marc_arr($m,$fld,'b',$name->{namePart}->{given}); |
123 |
marc_arr($m,$fld,'4',$role); |
124 |
|
125 |
# first author goes in 700, others in 701 |
126 |
$fld = '701'; |
127 |
} elsif ($role eq "editor" or $role eq "illustrator") { |
128 |
marc_arr($m,'702','a',$name->{namePart}->{family}); |
129 |
marc_arr($m,'702','b',$name->{namePart}->{given}); |
130 |
marc_arr($m,'702','4',$role); |
131 |
} else { |
132 |
die "FATAL: don't know how to map role '$role'" if ($role); |
133 |
} |
134 |
} |
135 |
|
136 |
my $note = $ref->{note}; |
137 |
|
138 |
if ($note) { |
139 |
foreach my $n (split(/\s*;\s+/, $note)) { |
140 |
if ($n =~ s/bibliogr:\s+//i) { |
141 |
marc_arr($m,'320','a',"Bibliografija: $n"); |
142 |
} elsif ($n =~ s/ilustr:\s+//i) { |
143 |
marc_arr($m,'215','c', $n); |
144 |
} else { |
145 |
marc_arr($m,'320','a',$n); |
146 |
} |
147 |
} |
148 |
} |
149 |
|
150 |
|
151 |
my $type = $ref->{identifier}->{type}; |
152 |
|
153 |
if ($type) { |
154 |
if ($type eq "isbn") { |
155 |
marc_arr($m,'010','a',$ref->{identifier}->{content}); |
156 |
} elsif ($type eq "issn") { |
157 |
marc_arr($m,'011','a',$ref->{identifier}->{content}); |
158 |
} else { |
159 |
die "unknown identifier type $type"; |
160 |
} |
161 |
} |
162 |
|
163 |
my $phy_desc = $ref->{physicalDescription}; |
164 |
if ($phy_desc) { |
165 |
my $tmp; |
166 |
foreach my $t (split(/\s*;\s+/, $phy_desc)) { |
167 |
if ($t =~ m/([^:]+):\s+(.+)$/) { |
168 |
$tmp->{$1} = $2; |
169 |
} else { |
170 |
die "can't parse $t"; |
171 |
} |
172 |
} |
173 |
my $data = $tmp->{pagin}; |
174 |
$data .= ", " if ($data); |
175 |
if ($tmp->{str}) { |
176 |
$data .= $tmp->{str}." str"; |
177 |
} |
178 |
marc_arr($m,'210','a', $data) if ($data); |
179 |
marc_arr($m,'210','d', $tmp->{visina}); |
180 |
} |
181 |
|
182 |
marc_arr($m,'001','',$ref->{recordInfo}->{recordIdentifier}); |
183 |
|
184 |
marc_arr($m,'200','a',$ref->{titleInfo}->{title}); |
185 |
marc_arr($m,'200','e',$ref->{titleInfo}->{subTitle}); |
186 |
|
187 |
marc_arr($m,'675','a',$ref->{classification}->{udc}); |
188 |
|
189 |
my $related = $ref->{relatedItem}->{type}; |
190 |
if ($related) { |
191 |
if ($related eq "series") { |
192 |
marc_arr($m,'675','a',$ref->{relatedItem}->{titleInfo}->{title}); |
193 |
marc_arr($m,'999','a',$ref->{relatedItem}->{titleInfo}->{partNumber}); |
194 |
} elsif ($related eq "preceding") { |
195 |
marc_arr($m,'430','a',$ref->{relatedItem}->{titleInfo}->{title}); |
196 |
} else { |
197 |
die "can't parse related item type $related" if ($related); |
198 |
} |
199 |
} |
200 |
|
201 |
marc_arr($m,'205','a',$ref->{originInfo}->{edition}); |
202 |
|
203 |
my $publisher = $ref->{originInfo}->{publisher}; |
204 |
if ($publisher =~ m,^(.+?)\s*/\s*(.+)$,) { |
205 |
marc_arr($m,'210','a', $2); |
206 |
marc_arr($m,'210','c', $1); |
207 |
} else { |
208 |
marc_arr($m,'210','c', $publisher); |
209 |
} |
210 |
|
211 |
marc_arr($m,'326','a',$ref->{originInfo}->{frequency}); |
212 |
marc_arr($m,'326','a',$ref->{originInfo}->{place}); |
213 |
|
214 |
marc_arr($m,'210','d',$ref->{originInfo}->{dateIssued}); |
215 |
|
216 |
$nr++; |
217 |
print "$nr " if ($nr % 100 == 0); |
218 |
|
219 |
$t->purge; # frees the memory |
220 |
} |
221 |
|
222 |
__END__ |
223 |
|
224 |
KNJIGA = { |
225 |
610a 'subject' => [ |
226 |
{ |
227 |
'topic' => [ |
228 |
'LIBRARIES-AUTOMATION', |
229 |
'ELECTRONIC DATA PROCESSING-LIBRARY SCIENCE' |
230 |
] |
231 |
} |
232 |
], |
233 |
'name' => [ |
234 |
{ |
235 |
'namePart' => { |
236 |
700b,701a... 'given' => 'Robert M.', |
237 |
700a,701b... 'family' => 'Hayes' |
238 |
}, |
239 |
'type' => 'personal', |
240 |
'role' => { |
241 |
'roleTerm' => { |
242 |
7004,7014... 'content' => 'author', |
243 |
'type' => 'text' |
244 |
} |
245 |
} |
246 |
}, |
247 |
{ |
248 |
'namePart' => { |
249 |
702b 'given' => 'Joseph', |
250 |
702a 'family' => 'Becker' |
251 |
}, |
252 |
'type' => 'personal', |
253 |
'role' => { |
254 |
'roleTerm' => { |
255 |
7024 'content' => 'editor', |
256 |
'type' => 'text' |
257 |
} |
258 |
} |
259 |
}, |
260 |
{ |
261 |
'namePart' => { |
262 |
702b 'given' => 'Joseph', |
263 |
702a 'family' => 'Becker' |
264 |
}, |
265 |
'type' => 'personal', |
266 |
'role' => { |
267 |
'roleTerm' => { |
268 |
7024 'content' => 'illustrator', |
269 |
'type' => 'text' |
270 |
} |
271 |
} |
272 |
} |
273 |
], |
274 |
'note' => 'bibliogr: 645-647; kazalo; ilustr: ilustr.', |
275 |
'identifier' => { |
276 |
010a 'content' => '0-471-36483-5', |
277 |
'type' => 'isbn' |
278 |
}, |
279 |
215a;215d 'physicalDescription' => 'str: 688; pagin: xvi; visina: 24. cm', |
280 |
001 'recordInfo' => { |
281 |
'recordIdentifier' => 'k-7996-8073' |
282 |
}, |
283 |
200a 'titleInfo' => { |
284 |
'title' => 'Handbook of data processing for libraries' |
285 |
}, |
286 |
'typeOfResource' => 'text', |
287 |
675a 'classification' => { |
288 |
'udc' => '=20' |
289 |
}, |
290 |
225a 'relatedItem' => { |
291 |
'titleInfo' => { |
292 |
'title' => 'A WILEY-BECKER & HAYES SERIES BOOK' |
293 |
}, |
294 |
'type' => 'series' |
295 |
}, |
296 |
'originInfo' => { |
297 |
'issuance' => 'monographic', |
298 |
205a 'edition' => '2.', |
299 |
210c/210a 'publisher' => 'MELVILLE PUBLISHING COMPANY /LOS ANGELES, CALIFORNIA/', |
300 |
210d 'dateIssued' => '1974' |
301 |
} |
302 |
}; |
303 |
|
304 |
|
305 |
|
306 |
CASOPIS = { |
307 |
'identifier' => { |
308 |
011a 'content' => '1041-5173', |
309 |
'type' => 'issn' |
310 |
}, |
311 |
'recordInfo' => { |
312 |
'recordIdentifier' => 'c-1' |
313 |
}, |
314 |
'titleInfo' => { |
315 |
'title' => 'DBMS - CLIENT/SERVER COMPUTING' |
316 |
}, |
317 |
'typeOfResource' => 'text', |
318 |
999a 'relatedItem' => { |
319 |
'titleInfo' => { |
320 |
'partNumber' => 'g. 1990, vol. 137, br. 5' |
321 |
}, |
322 |
'type' => 'series' |
323 |
}, |
324 |
'classification' => { |
325 |
'udc' => '=20' |
326 |
}, |
327 |
'originInfo' => { |
328 |
326a 'frequency' => 'mjeseļæ½no', |
329 |
'issuance' => 'continuing', |
330 |
210a 'place' => 'SAN MATEO, KANADA', |
331 |
210c 'publisher' => 'M&T PUBLISHING INC.' |
332 |
} |
333 |
}; |
334 |
|
335 |
|
336 |
=cut |
337 |
|