星火微课系统客户端
You can not select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.

pdf_rbld.ps 15KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344
  1. % Copyright (C) 2002 Artifex Software, Inc. All rights reserved.
  2. %
  3. % This software is provided AS-IS with no warranty, either express or
  4. % implied.
  5. %
  6. % This software is distributed under license and may not be copied,
  7. % modified or distributed except as expressly authorized under the terms
  8. % of the license contained in the file LICENSE in this distribution.
  9. %
  10. % For more information about licensing, please refer to
  11. % http://www.ghostscript.com/licensing/. For information on
  12. % commercial licensing, go to http://www.artifex.com/licensing/ or
  13. % contact Artifex Software, Inc., 101 Lucas Valley Road #110,
  14. % San Rafael, CA 94903, U.S.A., +1(415)492-9861.
  15. % $Id: pdf_rbld.ps 9175 2008-10-19 20:32:03Z alexcher $
  16. % pdf_rbld.ps - Rebuilding of broken PDF files (xref errors)
  17. % This module contains routines that are used if we detect an error
  18. % while reading the xref tables. These routines will scan the file and
  19. % build an xref table by finding the objects. We also need to find the
  20. % appropriate trailer dictionary. Note: One procedure is also used
  21. % even if we do not need to rebuild a PDF file.
  22. %
  23. % This module cannot rebuild a PDF file which has had errors created inside
  24. % of objects or binary data streams. It often succeeds with files that
  25. % have had its end of lines converted between unix and dos versions.
  % Flag: true --> the rebuild scan has seen two objects that share both
  % their object number and their generation number.
  /dup_obj_gen_num false def

  % Note: This procedure is also used by non-rebuild code.
  % Store one line of the xref table.  The table is kept in three parallel
  % larrays: ObjectStream, Objects and Generations.
  %
  %   <obj num> <strm num> <obj loc> <gen num> <rebuild>
  %       setxrefentry  <obj num> <strm num> <obj loc> <stored gen>
  %
  % <rebuild> is a boolean.  When it is false only empty (null) slots are
  % filled.  When it is true an already-filled slot is overwritten if the
  % new generation number is >= the one already stored.
  % <stored gen> is the value written to Generations: the generation number
  % plus one (an out of range generation number is treated as 0 first).
  /setxrefentry
  {
    5 1 roll                    % park <rebuild> underneath the four operands

    % Generation numbers must fit in 16 bits.
    dup 65535 or 65535 ne {
      ( **** Warning: Generation number out of 0..65535 range, assuming 0.\n)
      pdfformaterror
      pop 0
    } if

    % We store generation numbers as value + 1.
    % 0 is reserved to indicate a free xref entry.
    1 add

    % To save space, generation numbers are kept in an lstring until one
    % greater than 255 shows up.  At that point the data is moved into an
    % larray.
    dup 255 gt {
      Generations ltype /stringtype eq {
        larray Generations llength lgrowto dup    % new larray, same length
        0 1 2 index llength 1 sub {               % copy every old element
          Generations 1 index lget lput dup
        } for
        pop
        /Generations exch store                   % install the new larray
      } if
    } if

    % stack: <rebuild> <obj num> <strm num> <obj loc> <stored gen>
    % A null Objects slot means we have not seen this object number yet.
    Objects 4 index lget null eq {
      ObjectStream 4 index 4 index cvx lput   % Save ObjectStream object number
      Objects 4 index 3 index cvx lput        % Save object location
      Generations 4 index 2 index lput        % Save generation number
    } {
      % The slot is already in use (this branch runs only when rebuilding).
      Generations 4 index lget                % previously stored generation

      % Set the error flag only for a genuine duplicate.  The comparison
      % has to use the OLD stored value, i.e. it must happen before the
      % slot is overwritten below; otherwise every accepted replacement
      % (including a legitimately newer generation) would look like a
      % duplicate.
      dup 2 index eq { /dup_obj_gen_num true def } if

      % Accept the new entry if its generation number is at least as high.
      % Equal numbers are accepted because we have found PDF files in
      % which there are multiple objects with the same object and
      % generation numbers.  The normal xref logic only accepts the first
      % such entry that it finds.  However the 'rebuild PDF' logic can
      % find both such entries.  The correct one is usually the last one.
      1 index le {
        ObjectStream 4 index 4 index cvx lput % Save ObjectStream object number
        Objects 4 index 3 index cvx lput      % Save object location
        Generations 4 index 2 index lput      % Save generation number
      } if
    } 8 -1 roll { ifelse } { pop if } ifelse  % Run 'else' only when rebuilding.
  } bind def
  % Dump the xref data, one object per line, as
  %     <obj num> <gen num> <ObjectStream entry> <Objects entry>
  % The data lives in three parallel larrays (Objects, Generations and
  % ObjectStream).  larrays are a special Ghostscript object which can be
  % arrays with more than 64k elements.
  %
  %   - print_xref -
  /print_xref
  {
    0 1 Objects llength 1 sub {         % for each object number
      dup =only ( ) print                           % object number
      Generations 1 index lget 1 sub =only ( ) print  % stored as gen + 1
      ObjectStream 1 index lget ==only ( ) print    % ObjectStream object number
      Objects exch lget ===                         % object location
    } for
    flush
  } bind def
  % Read one token from a string and accept it only if it has the
  % requested type.
  %
  %   <string> <type> typed_token <obj> <last> true   % token of that type
  %   <string> <type> typed_token false               % no token, or wrong type
  %
  % <last> is the part of the string that follows the token.
  /typed_token
  {
    exch token_nofail {
      % stack: <type> <last> <obj>
      dup type 4 -1 roll ne {
        pop pop false           % wrong type: drop <obj> and <last>
      } {
        exch true               % right type: leave <obj> <last> true
      } ifelse
    } {
      pop false                 % no token at all: drop <type>
    } ifelse
  } bind def
  % Define post_eof_count up front so that the procedures below can refer
  % to it.  search_objects replaces this initial 0 with the measured value.
  /post_eof_count 0 def

  % We want the location of the trailer dictionary at the start of file.
  % First we find 'xref' within the first 300 bytes.  Then we skip over the
  % xref entries to the 'trailer' keyword.
  %
  %   - search_start_trailer <trailer loc>
  %
  % <trailer loc> is the file position reached after reading the 'trailer'
  % keyword.  If 'xref' is not found we fall back to search_end_trailer.
  /search_start_trailer
  {
    % Read the first 300 bytes (less if the file is shorter).
    PDFfile 0 setfileposition
    PDFfile bytesavailable post_eof_count sub   % location of end of data
    300 .min                                    % block size to read
    dup string 0 1 4 -1 roll 1 sub
      { 2 copy PDFfile read pop put pop } for
    (xref) search {
      % Found 'xref': reposition the file just past it.
      exch pop exch pop length 4 add PDFfile exch setfileposition
      PDFfile token pop         % starting entry number - or 'trailer'
      (trailer) ne {            % if we do not already have 'trailer'
        PDFfile token pop       % number of entries
        PDFfile token pop pop   % this moves us into the middle of the first entry
        25 string exch          % working string for readline
        { PDFfile 1 index readline pop pop } repeat   % skip the entries
        pop                     % pop working string
        PDFfile token pop pop   % read 'trailer'
      } if
      % In both cases the file is now positioned after 'trailer'.  The
      % position must be taken outside the 'if': otherwise nothing would be
      % returned when 'trailer' immediately follows 'xref'.
      PDFfile fileposition
    } {
      pop search_end_trailer    % no xref, should not happen, search end of file
    } ifelse
  } bind def
  % We want the location of the trailer dictionary at the end of file.
  % The last block of data (at most 65535 bytes) is read and searched for
  % the final occurrence of the word 'trailer'.
  %
  %   - search_end_trailer <trailer loc>
  %
  % Note: anything past the last %%EOF (post_eof_count bytes) is ignored
  % since this is not PDF data.
  /search_end_trailer
  {
    PDFfile 0 setfileposition
    PDFfile bytesavailable post_eof_count sub   % <end pos>
    dup 65535 .min                              % <end pos> <block size>

    % Move the file position to the start of the block.
    2 copy sub PDFfile exch setfileposition

    % Read the block one byte at a time into a fresh string.
    dup string 0 1 4 -1 roll 1 sub { 2 copy PDFfile read pop put pop } for

    % Keep searching the remainder until 'trailer' no longer occurs in it.
    (trailer) { search { pop } { exit } ifelse } loop

    % trailer loc = end loc - remaining string length
    length sub
  } bind def
  % We want to find the trailer dictionary.  There is a trailer dictionary
  % for each xref object list.  We only want the trailer dictionary
  % associated with the first xref object list.  In theory this can be
  % anywhere in the file.  However since we are trying to repair a broken
  % file, we cannot simply follow the xref links.  So we fall back to a
  % simple strategy: find the location given after the last 'startxref'.
  % If that location is in the first half of the file we search for the
  % first trailer dictionary at the start of the file.  Otherwise we search
  % for the last trailer at the end of the file.
  %
  %   - search_trailer -
  %
  % The result is stored in /Trailer.
  /search_trailer
  {
    % Read the last block (at most 4096 bytes) of the useful data.  The
    % useful data ends at the last %%EOF.  (Some files contain trailing
    % garbage.)
    PDFfile 0 setfileposition
    PDFfile bytesavailable              % size of file
    post_eof_count sub dup              % location of end of last %%EOF (twice)
    dup 4096 .min                       % block size to read
    % stack: <useful size> <useful size> <block size>
    2 copy sub PDFfile exch setfileposition   % go to the start of the block
    dup string 0 1 4 -1 roll 1 sub { 2 copy PDFfile read pop put pop } for

    % Search for the last occurrence of 'startxref'.
    false                               % assume that startxref is not present
    exch (startxref) {
      search {
        pop 3 -1 roll pop true 3 1 roll % note that we have found startxref
      } {
        exit                            % no more startxref's
      } ifelse
    } loop
    exch                                % <remaining string> <found flag>
    {
      % 'startxref' loc = end loc - remaining string length - 9 bytes
      length sub 9 sub
      % Go there, skip the keyword and read the position that follows it.
      PDFfile exch setfileposition PDFfile token
      pop pop PDFfile token pop
    } {
      % startxref not found.  Using the file length here makes the test
      % below choose the end of the file.
      pop pop PDFfilelen
    } ifelse

    % Compare the xref position to 1/2 the useful length of the file.
    exch 2 div lt { search_start_trailer } { search_end_trailer } ifelse

    % Read the trailer dictionary found at that location.
    PDFfile exch setfileposition
    /dictlevelcount 0 def
    PDFfile traileropdict .pdfrun
    /Trailer exch def
  } bind def
  % This routine determines how much stuff follows the %%EOF.  There is
  % supposed to be only a line termination.  However many real life files
  % contain some garbage.  The count returned here is ignored when we scan
  % the file for objects.
  %
  %   - determine_post_eof_count <count>
  /determine_post_eof_count
  {
    % Read the last block (at most 4096 bytes) of the file.
    PDFfilelen                          % file_size
    dup 4096 .min                       % file_size block_size
    dup 3 1 roll sub                    % block_size file_size-block_size
    PDFfile exch setfileposition        % block_size
    string PDFfile exch readstring pop  % <block contents>

    % Move past the last occurrence of 'startxref'.  We key on this word
    % because '%%EOF' is often damaged.
    (startxref) { search { pop } { exit } ifelse } loop

    % Now look for '%%EO'; failing that, try to skip the token (normally a
    % number) that follows 'startxref'.
    (%%EO) search {
      pop pop                           % keep only what follows '%%EO'
    } {
      { dup token { pop exch pop } if } stopped { pop } if
    } ifelse

    % What is left over is counted as post-EOF data.
    length
  } bind def
  % This routine scans the file line by line, looking for the start of
  % objects ('<obj num> <gen num> obj'), to build an alternate version of
  % the data in the xref tables.  Its purpose is to provide a basis for an
  % xref fixing facility.
  %
  %   - search_objects -
  /search_objects
  {
    % Initialize the Objects, Generations, etc. larrays.
    initPDFobjects
    % Reset the duplicate object and generation numbers error flag.
    /dup_obj_gen_num false def
    % Determine how many bytes are in the file after the final %%EOF.
    /post_eof_count determine_post_eof_count def
    % Start at the beginning of the file.
    PDFfile 0 setfileposition

    % Create a working string (and also keep its length on the stack).  We
    % use a maximum size string since the logic below wants a line to fit
    % into the working string.
    65535 dup string
    {                                   % loop through the entire file
      PDFfile fileposition              % remember where this line starts
      % stack: <length> <working_str> <loc>

      % When we get near the end of the file, we use a smaller interval of
      % the working string to prevent reading past the end.  (See the
      % comments on EOF testing below.)
      PDFfile bytesavailable post_eof_count sub 10 sub dup 4 index lt {
        2 index 0 3 -1 roll getinterval % near EOF, use interval of string
      } {
        pop 1 index                     % not near end, use full working string
      } ifelse

      % Read a line from the file.  If the line does not fit into the
      % string, or on any other error, the line is discarded.
      PDFfile exch { readline } .internalstopped
      { pop pop false } if              % indicate no string if we stopped
      {
        % stack: <length> <working_str> <loc> <string>
        % Pull <obj num>, <gen num> and 'obj' off the line, verifying that
        % each one has the right type.
        /integertype typed_token {      % object number
          /integertype typed_token {    % generation number
            /nametype typed_token {     % the 'obj' keyword
              pop                       % discard the rest of the line
              /obj eq {                 % verify that the name is 'obj'
                % Make sure we have room in the arrays.  The requested size
                % is rounded to a multiple of 20.
                1 index 20 add 20 idiv 20 mul
                growPDFobjects
                % Build <obj num> 0 <loc - PDFoffset> <gen num> true for
                % setxrefentry.
                1 index 0
                4 index PDFoffset sub 3 index
                //true setxrefentry     % record the entry
                pop pop pop pop         % discard what setxrefentry returns
              } if
            } if
            pop                         % remove generation number
          } if
          pop                           % remove object number
        } if
      } if
      pop                               % remove location

      % Check if we are approaching the end of the file.  We do not want to
      % read past the end of the file since that closes it.  We actually
      % stop 10-20 bytes early since there cannot be an object that close
      % to the end.  (There is a Trailer dictionary, etc. at the end of the
      % file.)
      PDFfile bytesavailable post_eof_count sub 20 lt { exit } if
    } loop
    pop pop                             % remove working string and its length

    % Output a warning if we have two objects with the same object and
    % generation numbers.
    dup_obj_gen_num {
      ( **** Warning: There are objects with matching object and generation\n)
      pdfformaterror
      ( **** numbers. The accuracy of the resulting image is unknown.\n)
      pdfformaterror
    } if
  } bind def
  % Tell the user that a problem was found while reading the xref tables.
  % Each line of the message is passed to pdfformaterror, in order.
  %
  %   - print_xref_warning -
  /print_xref_warning
  {
    {
      ( **** Warning: An error occurred while reading an XREF table.\n)
      ( **** The file has been damaged. This may have been caused\n)
      ( **** by a problem while converting or transfering the file.\n)
      ( **** Ghostscript will attempt to recover the data.\n)
    } { pdfformaterror } forall
  } bind def
  % Attempt to recover the XRef data.  This is called if we have a failure
  % while reading the normal XRef tables.  This routine usually works only
  % for pre PDF 1.5 versions of PDF files.
  %
  %   - recover_xref_data -
  /recover_xref_data
  {
    % Tell the user what is going on.
    print_xref_warning
    % Drop whatever readxref left behind: everything above pdfemptycount.
    count pdfemptycount sub { pop } repeat
    % Rebuild the tables by scanning the file for objects.
    search_objects
  } bind def