# b.parse.pl
# Victor Liu See-le - mailto:victor@n-gon.com
# Helper file for parsing through XBEL files.
#
# Information on the XML Bookmark Exchange Language (XBEL) is at:
#        http://www.python.org/topics/xml/xbel/
#
# Copyright (C) 2001 Victor Liu See-le
#
# This program is free software; you can redistribute it and/or
# modify it under the terms of the GNU General Public License
# as published by the Free Software Foundation; either version 2
# of the License, or (at your option) any later version. 
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU General Public License for more details. 
# 
# You should have received a copy of the GNU General Public License
# along with this program; if not, write to the Free Software
# Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA  02111-1307, USA.

require "b.header.pl";

###############################################################################
# Following code provides Shallow Parsing of XML                              #

# REX/Perl 1.0 
# Robert D. Cameron "REX: XML Shallow Parsing with Regular Expressions",
# Technical Report TR 1998-17, School of Computing Science, Simon Fraser 
# University, November, 1998.
# Copyright (c) 1998, Robert D. Cameron. 
# The following code may be freely used and distributed provided that
# this copyright and citation notice remains intact and that modifications
# or additions are clearly identified.

$TextSE = "[^<]+";
$UntilHyphen = "[^-]*-";
$Until2Hyphens = "$UntilHyphen(?:[^-]$UntilHyphen)*-";
$CommentCE = "$Until2Hyphens>?";
$UntilRSBs = "[^\\]]*](?:[^\\]]+])*]+";
$CDATA_CE = "$UntilRSBs(?:[^\\]>]$UntilRSBs)*>";
$S = "[ \\n\\t\\r]+";
$NameStrt = "[A-Za-z_:]|[^\\x00-\\x7F]";
$NameChar = "[A-Za-z0-9_:.-]|[^\\x00-\\x7F]";
$Name = "(?:$NameStrt)(?:$NameChar)*";
$QuoteSE = "\"[^\"]*\"|'[^']*'";
$DT_IdentSE = "$S$Name(?:$S(?:$Name|$QuoteSE))*";
$MarkupDeclCE = "(?:[^\\]\"'><]+|$QuoteSE)*>";
$S1 = "[\\n\\r\\t ]";
$UntilQMs = "[^?]*\\?+";
$PI_Tail = "\\?>|$S1$UntilQMs(?:[^>?]$UntilQMs)*>";
$DT_ItemSE = "<(?:!(?:--$Until2Hyphens>|[^-]$MarkupDeclCE)|\\?$Name(?:$PI_Tail))|%$Name;|$S";
$DocTypeCE = "$DT_IdentSE(?:$S)?(?:\\[(?:$DT_ItemSE)*](?:$S)?)?>?";
$DeclCE = "--(?:$CommentCE)?|\\[CDATA\\[(?:$CDATA_CE)?|DOCTYPE(?:$DocTypeCE)?";
$PI_CE = "$Name(?:$PI_Tail)?";
$EndTagCE = "$Name(?:$S)?>?";
$AttValSE = "\"[^<\"]*\"|'[^<']*'";
$ElemTagCE = "$Name(?:$S$Name(?:$S)?=(?:$S)?(?:$AttValSE))*(?:$S)?/?>?";
$MarkupSPE = "<(?:!(?:$DeclCE)?|\\?(?:$PI_CE)?|/(?:$EndTagCE)?|(?:$ElemTagCE)?)";
$XML_SPE = "$TextSE|$MarkupSPE";

sub ShallowParse { 
  my($XML_document) = @_;
  return $XML_document =~ /$XML_SPE/g;
}

# End of borrowed code                                                        #
###############################################################################


@Possible_Xbel_Elem = ("xbel", "folder", "bookmark", "title", "desc", 
                       "comment", "info");
@Possible_Xbel_EmptyElem = ("metadata", "alias", "separator");
@Possible_Xbel_PCData = ("title", "desc", "comment");


sub set_verbose {
  $verbose = 1;
}

sub open_and_lock_xml_file {
  open (XML_FILE, "+<$xml_file") or die "Cannot open $xml_file: $!\n";
  flock ($xml_file, 2);
}

sub open_xml_file {
  open (XML_FILE, "+<$xml_file") or die "Cannot open $xml_file: $!\n";
}

sub unlock_and_close_xml_file {
  flock ($xml_file, 8);
  close (XML_FILE);
}

sub close_xml_file {
  close (XML_FILE);
}

sub get_text_indent {
  return $tab x &get_depth;
}

sub shallow_parse_xml {
  my $bufline;

  # Read in XML and Shallow Parse. This does a top-level parsing.
  # This should be called after the $xml_file has been opened.
  $xml_file_save = '';
  while ($bufline = <XML_FILE>) 
    { $xml_file_save .= $bufline; }
  @parsed = &ShallowParse($xml_file_save);
}

sub write_xml {
  return if (! @parsed);
  
  seek (XML_FILE, 0, 0);
  truncate (XML_FILE, 0);
  print XML_FILE @parsed; 
  close (XML_FILE);
}

sub write_xml_with_backup {
  return if (! @parsed);

  # Backup
  if (open (BACKUP, ">$xml_file.bak")) {
    print BACKUP $xml_file_save;
    close (BACKUP);
  }
  seek (XML_FILE, 0, 0);
  truncate (XML_FILE, 0);
  print XML_FILE @parsed; 
  close (XML_FILE);
}

sub set_handler {
  my ($type, $subref) = @_;
  $handlers{$type} = $subref;
}

sub handle {
  my $type = $_[0];
  if (defined $handlers{$type}) { 
    my $subref = $handlers{$type}; 
    &$subref;
  }
}

sub process_xbel {

  &shallow_parse_xml if (! @parsed);

  # Iterate through array of parsed elements. Identify type of element, and
  # call handler based on element type. &set_handler must be called 
  # first to make &process_xbel useful.

  for $parsed_index (0 .. $#parsed) {
    local $_ = $parsed[$parsed_index];
    $cur_parse = $_;

    last if (! defined($_));

    # Check if non-white space.
    if (/\S+/) {

      # Check if tag or just text; tag opens with left angle bracket.
      # Tags assumed to be already trimmed of exterior whitespace.
      if (/^</) {

        # Examine type of tag.
        if (/^<!--/) {
          # Comment.
          print "COMMENT:$_\n" if $verbose;
          &handle('Comment');
        }
        elsif (/^<\?/) {
          # Ignore Processing Instruction.
          print "PI:$_\n" if $verbose;
          &handle('PI');
        }
        elsif (/^<!DOCTYPE/) {
          # Ignore Doctype.
          print "DOCTYPE:$_\n" if $verbose;
          &handle('Doctype');
        }
        elsif (/^<!\[CDATA\[/) {
          # Ignore CData.
          print "CDATA:$_\n" if $verbose;
          &handle('CData');
        }
        else {

          # Element tag: either
          #   Element Start Tag: Ends with ">?"
          #   Element End Tag: Begins with "</", ends with ">?"
          #   Empty Element Tag: Ends with "/>?"
          # Info on the current node is stored in a stack @node_types.
          # The corresponding attributes for each item in @node_types 
          # is stored in the stack @cur_attributes.

          if (m|^</|) {

            # Begins with "</": Element End Tag.
            print "ELEMENT-END:$_\n" if $verbose;

            # Remove outside brackets for parsing.
            my $tag = $';
            $tag =~ />?$/ and $tag = $`;

            ($cur_type, $cur_attributes) = &parse_element_tag($tag);

            &handle("Element-End"); # generic

            foreach my $type (@Possible_Xbel_Elem) {
              &handle("Element-End.$type"), last if ($cur_type eq $type);
            }

            # Remove (up to) the node corresponding to this element.
            while (my $node = shift @node_types, 
                   shift @node_attributes,
                   shift @node_indices,
                   shift @node_ids) {
              if ($node eq $cur_type) { last; } 
            }
          }
          elsif (m|/>$|) {

            # Ends with "/>?": Empty Element Tag.
            print "ELEMENT-EMPTY:$_\n" if $verbose;

            # Remove outside brackets for parsing.
            my $tag = $`; 
            $tag =~ /^</ and $tag = $';

            ($cur_type, $cur_attributes) = &parse_element_tag($tag);

            # Parse attributes for ID.
            my %att = &parse_attributes($cur_attributes);
            $cur_id = $att{'id'} ? $att{'id'} : '';

            # Add to current node tree.
            unshift(@node_types, $cur_type);
            unshift(@node_attributes, $cur_attributes); 
            unshift(@node_indices, $parsed_index); 
            unshift(@node_ids, $cur_id); 

            &handle("Element-Empty"); # generic

            foreach my $type (@Possible_Xbel_EmptyElem) {
              &handle("Element-Empty.$type"), last if ($cur_type eq $type);
            }

            # Now remove from node; that was fun.
            shift @node_types; 
            shift @node_attributes;
            shift @node_indices;
            shift @node_ids;

          }
          elsif (m|>?$|) {

            # Element Start Tag.
            print "ELEMENT-START:$_\n" if $verbose;
            
            # Remove outside brackets for parsing.
            my $tag = $`; 
            $tag =~ /^</ and $tag = $';

            ($cur_type, $cur_attributes) = &parse_element_tag($tag);

            # Parse attributes for ID.
            my %att = &parse_attributes($cur_attributes);
            $cur_id = $att{'id'} ? $att{'id'} : '';

            # Add to current node tree.
            unshift(@node_types, $cur_type);
            unshift(@node_attributes, $cur_attributes); 
            unshift(@node_indices, $parsed_index); 
            unshift(@node_ids, $cur_id); 

            &handle("Element-Start"); # generic

            foreach my $type (@Possible_Xbel_Elem) {
              &handle("Element-Start.$type"), last if ($cur_type eq $type);
            }

          }
          else {
            # Unknown. Will not get here.
            die "Unknown ELEMENT tag: $_\n";
          }

          print "  NAME--$cur_type" if $verbose;
          print "  ATTRIB--$cur_attributes\n" if $verbose;

        }
      }
      else {

        # Not a tag. Should be text between tags.
        print "TEXT[$_]\n" if $verbose;

        $cur_text = $_;

        &handle("Text"); # generic

        foreach my $pcdata (@Possible_Xbel_PCData) {
          if ($node_types[0] eq $pcdata) {
            foreach my $name (@Possible_Xbel_Elem) {
              &handle("Text.$name.$pcdata"), last if ($node_types[1] eq $name);
            }
          }
        }

      }
    }
    else {

      # whitespace
      &handle("whitespace"); 
    }

  }
}

sub parse_element_tag {
  my $tag = $_[0];
  $tag =~ /^$Name/;
  my $name = $&;
  my $attributes = $';
  $attributes =~ s/^\s*()/$1/; # Chop opening whitespace if any
  return ($name, $attributes);
}


sub parse_attributes {
  my($att_string, %ass);
  $att_string = $_[0];
  while ($att_string =~ /($Name)\w*=\w*($AttValSE)/) {
    $att_string = $';
    my($key, $val) = ($1, $2);
    ($val =~ s/^"(.*)"$/$1/) or ($val =~ s/^'(.*)'$/$1/);
    $ass{$key} = $val;
  }
  return %ass;
}

sub get_type {
  return $cur_type;
}

sub get_attributes {
  return $cur_attributes;
}

sub get_id {
  return $cur_id;
}

sub ensure_valid_id {

  # Check if this ID exists. If it exists, simply return it. If not, create
  # a new one of the form 'type.#', where # is unique for this XBEL doc.

  return $_[0] if $_[0];

  my $idnum = &get_next_idnum;
  my $id = $cur_type . "." . $idnum;

  # Update attributes.
  $cur_attributes = &set_attribute($cur_attributes, 'id', $id);
  shift(@node_attributes); 
  unshift(@node_attributes, $cur_attributes); 
  my $s = qq[<$cur_type $cur_attributes>];
  &splice_xbel($parsed_index, 1, $s);
  
  return $id;
}

sub get_next_idnum {
  $highest_idnum++;

  # Check if this new idnum is really unique.
  while (grep(/id=['"][A-Za-z0-9_-]+\.$highest_idnum['"]/, @parsed)) 
    { $highest_idnum++; }

  return $highest_idnum;
}
    
sub get_index {
  return $parsed_index;
}

sub get_parents_type {
  return $node_types[1];
}

sub get_parents_attributes {
  return $node_attributes[1];
}

sub get_parents_id {
  return $node_ids[1];
}

sub get_parents_index {
  return $node_indices[1];
}

sub get_node_type {
  return $node_types[$_[0]];
}

sub get_node_attributes {
  return $node_attributes[$_[0]];
}

sub get_node_id {
  return $node_ids[$_[0]];
}

sub get_node_index {
  return $node_indices[$_[0]];
}

sub get_text {
  return &trim($cur_text);
}

sub get_depth {
  return $#node_types;
}

sub get_folder_level {
  # Folder level determined by number of folder elements in node stack.
  my($item, $occurrences);
  $occurrences = 0;
  foreach $item (@node_types) {
    $item eq "folder" and $occurrences ++;
  }
  return $occurrences;
}

sub set_attribute {
  my($old_att, $set_var, $set_val) = @_;
  my($get_var, $get_val, $new_att, $exists);

  while ($old_att =~ /($Name)\w*=\w*($AttValSE)/) {
    $old_att = $';
    ($get_var, $get_val) = ($1, $2);
    if ($get_var eq $set_var) {
      $new_att .= qq[$set_var="$set_val" ];
      $exists = 1;
      last;
    }
    else {
      $new_att .= "$& ";
    }
  }
  $new_att .= $old_att;
  $new_att .= qq[$set_var="$set_val"] if (! $exists); 
  return &trim($new_att);
}

sub splice_xbel {
  $xml_changed = 1;
  return splice(@parsed, shift @_, shift @_, @_);
}

sub get_cur_parse {
  return $cur_parse;
}

sub trim {
  my $s = $_[0];
  $s =~ /^(\s*)/;
  $s = $';
  $s =~ /(\s*)$/;
  $s = $`;
  return $s;
}

sub xml_has_changed {
  return $xml_changed;
}

sub set_working_xml_file {
  $xml_file = $_[0];
}

sub reset_xbel {
  $xml_changed = 0;
  @parsed = ();
}

1;
