rubymark/document.rb

660 lines
17 KiB
Ruby

# frozen_string_literal: true
module RBMark
# Splits text into chunks by scanning for the markers of a set of element
# classes.
#
# Every variant must respond to +begin+ and +end+ (patterns accepted by
# String#match) and to +parse+. The default class only needs +parse+; it
# receives every stretch of text that no variant claimed.
class Parser
  # @param variants [Array<#begin, #end, #parse>] element classes to look for
  # @param default [#parse] class used for text outside of any variant
  def initialize(variants, default)
    @default = default
    @variants = variants
    # begin pattern => variant. Variants sharing a begin pattern collapse
    # into one entry (the later one replaces the earlier one).
    @markers = @variants.to_h { |variant| [variant.begin, variant] }
  end

  # Parse text using the given variants
  # @param text [String, nil]
  # @return [Array<::RBMark::DOM::DOMObject>]
  def parse(text)
    chunks = []
    until text.nil? || text.empty?
      before, chunk, text = get_chunk(text)
      chunks.append(@default.parse(before)) unless before.empty?
      chunks.append(chunk) if chunk
    end
    # The default parser may hand back an array, hence the flatten.
    chunks.flatten
  end

  private

  # Cut the next element out of the text.
  # @param text [String]
  # @return [Array(String, Object, String)] text before the element, the
  #   parsed element (nil when none was produced), and the text still to
  #   be scanned (nil when nothing is left)
  def get_chunk(text)
    element, match = get_element(text)
    return [text, nil, nil] unless element

    chunk, after = finalize_element(text[match.begin(0)..], element)
    return [match.pre_match, chunk, after] if chunk

    # Unterminated element: keep the opening marker as plain text instead
    # of silently dropping it, then resume scanning right after it.
    [match.pre_match + match[0], nil, match.post_match]
  end

  # Find the variant whose begin pattern matches earliest in the text.
  # @param text [String]
  # @return [Array(Object, MatchData), nil] nil when no variant matches
  def get_element(text)
    candidates = @markers.filter_map do |pattern, element|
      match = text.match(pattern)
      [element, match] if match
    end
    candidates.min_by { |_element, match| match.begin(0) }
  end

  # Locate the end of an element and parse everything up to it.
  # @param text [String] text starting at the element's begin marker
  # @param element [#end, #parse]
  # @return [Array(Object, String), Array(nil, nil)] parsed chunk and the
  #   remaining text, or two nils when the end pattern does not match
  def finalize_element(text, element)
    match = text.match(element.end)
    return [nil, nil] unless match

    stop = match.end(0)
    # Exclusive range: a terminator ending at offset 0 yields an empty
    # chunk rather than wrapping around to the whole text.
    [element.parse(text[...stop]), text[stop..]]
  end
end
# Module for representing abstract object hierarchy
module DOM
# Abstract container: base class for every node of the parsed tree.
#
# Class-level state (variants, begin/end markers, default class, alt_for)
# lives in class instance variables, so each subclass keeps its own copy.
class DOMObject
  class << self
    # variants      - element classes that may appear inside this one
    # begin / end   - patterns marking where this element starts and stops
    # default_class - class that parses text not claimed by any variant
    # alt_for       - class instantiated instead of this one by {create}
    attr_accessor :variants, :begin, :end, :default_class, :alt_for

    # Hook for initializing variables: a new subclass starts with a copy
    # of the parent's variants and the parent's default class.
    # @param subclass [Class]
    def inherited(subclass)
      super
      @subclasses ||= []
      @subclasses.append(subclass)
      subclass.variants = (@variants || []).dup
      subclass.default_class = @default_class
    end

    # Add potential sub-element variant to this class and, recursively,
    # to every subclass registered through {inherited}.
    # @param cls [Class] DOMObject subclass
    # @raise [StandardError] if cls is not a DOMObject subclass
    def variant(cls)
      unless cls < ::RBMark::DOM::DOMObject
        raise StandardError, "#{cls} is not a DOMObject subclass"
      end

      # The base class never passes through `inherited`, so its list is
      # created lazily here instead of failing on nil.
      (@variants ||= []).append(cls)
      @subclasses&.each { |subclass| subclass.variant(cls) }
    end

    # Set default element class for this class and every subclass.
    # @param cls [Class] DOMObject subclass
    # @raise [StandardError] if cls is not a DOMObject subclass
    def default(cls)
      unless cls < ::RBMark::DOM::DOMObject
        raise StandardError, "#{cls} is not a DOMObject subclass"
      end

      @default_class = cls
      @subclasses&.each { |subclass| subclass.default(cls) }
    end

    # Parse text from the given context: run a Parser configured with this
    # class's variants and default class, then attach the results as
    # children of a fresh instance.
    # @param text [String]
    # @return [DOMObject] the instance built by {create}
    def parse(text)
      parser = ::RBMark::Parser.new(@variants, @default_class)
      instance = create
      instance.append(*parser.parse(text))
      instance
    end

    # Create a new instance of this class, or of the class named by
    # alt_for when one is set.
    # @return [DOMObject]
    def create
      @alt_for ? @alt_for.new : new
    end
  end

  # Get text content, child nodes and properties of a DOMObject
  # @return [String, nil] content
  # @return [Array<DOMObject>] children
  # @return [Hash] properties
  attr_reader :content, :children, :properties

  def initialize
    @content = nil
    @children = []
    @properties = {}
  end

  # Set certain properties in the properties hash
  # @param properties [Hash] properties to update
  def property(**properties)
    @properties.update(**properties)
  end

  # Add children to container
  # @param children [Array<DOMObject>]
  # @raise [StandardError] if any argument is not a DOMObject
  def append(*children)
    unless children.all? { |x| x.is_a? DOMObject }
      raise StandardError, "one of #{children.inspect} is not a DOMObject"
    end

    @children.append(*children)
  end

  # Insert a child into the container
  # @param index [Integer]
  # @param child [DOMObject]
  # @raise [StandardError] if child is not a DOMObject
  def insert(index, child)
    raise StandardError, "not a DOMObject" unless child.is_a? DOMObject

    @children.insert(index, child)
  end

  # Delete a child from container
  # @param index [Integer]
  def delete_at(index)
    @children.delete_at(index)
  end

  # Get a child from the container
  # @param key [Integer]
  def [](key)
    @children[key]
  end

  # Set text content of a DOMObject
  # @param text [String]
  # @raise [StandardError] if text is not a String
  def content=(text)
    raise StandardError, "not a String" unless text.is_a? String

    @content = text
  end

  # Get length of text contents: the sum of the children's lengths.
  # @return [Integer]
  def length
    @children.sum(&:length)
  end
end
# Inline text: a leaf node that stores a plain string in its content.
class Text < DOMObject
  class << self
    # Stub parser for inline text: newlines become spaces and every run
    # of whitespace is squeezed into a single space.
    # @param text [String]
    # @return [self]
    def parse(text)
      new.tap do |node|
        node.content = text.tr("\n", ' ').gsub(/\s+/, ' ')
      end
    end
  end

  # Get length of inline text
  # @return [Integer] number of characters in the content
  def length
    content.length
  end
end
# Inline preformatted text (backtick-delimited).
class InlinePre < DOMObject
  self.begin = /(?<!\\)`(?!`)/
  self.end = /`.+?`/

  # Stub parser for inline text element: wraps the text, verbatim, in a
  # single Text child.
  # NOTE(review): when called by the Parser the text still includes the
  # surrounding backticks - confirm that consumers expect that.
  # @param text [String]
  # @return [self]
  def self.parse(text)
    instance = new
    element = ::RBMark::DOM::Text.new
    element.content = text
    instance.append element
    instance
  end

  # Get length of inline preformatted text.
  #
  # parse keeps the text in a child node rather than in @content, so the
  # children's length is used when no content is set. Returning 0 here made
  # Document.vacuum drop blocks made up only of inline code or an image.
  # @return [Integer]
  def length
    @content ? @content.length : super
  end
end
# Inline formattable text: a container whose text is parsed for inline
# elements and then tidied with {cleanup}.
class InlineFormattable < DOMObject
  # (see ::RBMark::DOM::DOMObject.parse)
  def self.parse(text)
    parsed = super(text)
    cleanup(parsed)
  end

  # Clean up internal text chunks.
  #
  # For every child that is first, or that directly follows a line break:
  # a Text child gets its leading whitespace stripped, and an InlineBreak
  # child is removed.
  # @param paragraph [DOMObject]
  # @return [DOMObject] the same object, modified in place
  def self.cleanup(paragraph)
    nodes = paragraph.children
    stale = nodes.each_index.select do |position|
      node = nodes[position]
      leader = position.zero? ? nil : nodes[position - 1]
      next false unless leader.nil? || leader.is_a?(::RBMark::DOM::InlineBreak)

      node.content = node.content.lstrip if node.is_a?(::RBMark::DOM::Text)
      node.is_a?(::RBMark::DOM::InlineBreak)
    end
    # Remove from the back so that earlier indices stay valid.
    stale.reverse_each { |position| paragraph.delete_at(position) }
    paragraph
  end
end
# Bold text, delimited by double asterisks.
class InlineBold < InlineFormattable
  self.begin = /(?<!\\)\*\*(?!\*\*)/
  self.end = /\*\*.+?\*\*/

  class << self
    # Drops the two delimiter characters on each side, then parses the
    # rest as inline text.
    # (see ::RBMark::DOM::DOMObject.parse)
    def parse(text)
      inner = text[2...-2]
      super(inner)
    end
  end
end
# Italics text, delimited by single asterisks.
class InlineItalics < InlineFormattable
  self.begin = /(?<!\\)\*(?!\*)/
  self.end = /\*.+?\*/

  class << self
    # Drops one delimiter character on each side, then parses the rest
    # as inline text.
    # (see ::RBMark::DOM::DOMObject.parse)
    def parse(text)
      inner = text[1...-1]
      super(inner)
    end
  end
end
# Inline italics text (alternative), delimited by single underscores.
# Parsed results are created as InlineItalics instances via alt_for.
class InlineAltItalics < InlineFormattable
  self.begin = /(?<!\\)_(?!_)/
  self.end = /_.+?_/
  self.alt_for = ::RBMark::DOM::InlineItalics

  class << self
    # Drops one delimiter character on each side, then parses the rest
    # as inline text.
    # (see ::RBMark::DOM::DOMObject.parse)
    def parse(text)
      inner = text[1...-1]
      super(inner)
    end
  end
end
# Underline text, delimited by double underscores.
class InlineUnder < InlineFormattable
  self.begin = /(?<!\\)__(?!__)/
  self.end = /__.+?__/

  class << self
    # Drops the two delimiter characters on each side, then parses the
    # rest as inline text.
    # (see ::RBMark::DOM::DOMObject.parse)
    def parse(text)
      inner = text[2...-2]
      super(inner)
    end
  end
end
# Strikethrough text, delimited by double tildes.
class InlineStrike < InlineFormattable
  self.begin = /(?<!\\)~~(?!~~)/
  self.end = /~~.+?~~/

  class << self
    # Drops the two delimiter characters on each side, then parses the
    # rest as inline text.
    # (see ::RBMark::DOM::DOMObject.parse)
    def parse(text)
      inner = text[2...-2]
      super(inner)
    end
  end
end
# Hyperreferenced text of the form [label](target).
# The begin pattern covers the whole link and doubles as the end pattern.
class InlineLink < InlineFormattable
  self.begin = /(?<![!\\])\[[^\]]+?\]\([^)]+?\)/
  self.end = self.begin

  # Parses the label as inline text and stores the target in the :link
  # property.
  # (see ::RBMark::DOM::DOMObject.parse)
  def self.parse(text)
    parts = text.match(/\[([^\]]+?)\]\(([^)]+?)\)/)
    instance = super(parts[1])
    instance.property link: parts[2]
    instance
  end
end
# Image of the form ![label](target).
# The begin pattern covers the whole image and doubles as the end pattern.
class InlineImage < InlinePre
  self.begin = /(?<!\\)!\[[^\]]+?\]\([^)]+?\)/
  self.end = self.begin

  # Keeps the label verbatim (InlinePre parsing) and stores the target in
  # the :link property.
  # (see ::RBMark::DOM::DOMObject.parse)
  def self.parse(text)
    parts = text.match(/!\[([^\]]+?)\]\(([^)]+?)\)/)
    instance = super(parts[1])
    instance.property link: parts[2]
    instance
  end
end
# Linebreak: a hard line break inside inline text.
#
# A break is a run of two or more spaces directly before a newline (the
# Markdown Guide convention). A lone space must not match, otherwise every
# gap between words would become a break.
# NOTE(review): confirm this is the intended break syntax for the project.
class InlineBreak < DOMObject
  self.begin = / {2,}\n/
  # The begin pattern already covers the whole break.
  self.end = self.begin

  # The matched text carries no information; a bare node is enough.
  # (see ::RBMark::DOM::DOMObject.parse)
  def self.parse(_text)
    new
  end

  # Stub for inline break length
  # @return [Integer] always 0
  def length
    0
  end
end
# Document root: parses block-level elements, drops empty ones and merges
# adjacent list fragments.
class Document < DOMObject
  # Normalizes line endings, parses, then runs {vacuum} and {merge}.
  # (see ::RBMark::DOM::DOMObject.parse)
  def self.parse(text)
    merge(vacuum(super(normalize_newlines(text))))
  end

  # Replace all forms of line endings with UNIX format newline
  # @param text [String]
  # @return [String]
  def self.normalize_newlines(text)
    text.gsub(/(?:\r\n|\n\r|\r|\n)/, "\n")
  end

  # Remove all child elements whose length is not positive
  # @param document [DOMObject]
  # @return [DOMObject] the same object, modified in place
  def self.vacuum(document)
    stale = document.children.each_index.reject do |index|
      document.children[index].length.positive?
    end
    # Remove from the back so that earlier indices stay valid.
    stale.reverse_each { |index| document.delete_at(index) }
    document
  end

  # Merge adjacent lists with indent blocks as per markdownguide guidelines
  # @param document [DOMObject]
  # @return [DOMObject] the same object, modified in place
  def self.merge(document)
    last_list = nil
    merged = []
    document.children.each_with_index do |child, index|
      if last_list && mergeable?(last_list, child)
        merge_adjacent(last_list, child)
        merged.append(index)
      else
        # A list that cannot be merged into the previous one becomes the
        # new merge target itself, so a following indent block or list
        # fragment can still attach to it.
        last_list = list_block?(child) ? child : nil
      end
    end
    merged.reverse_each { |index| document.delete_at(index) }
    document
  end

  # Check if an element is exactly a ULBlock or an OLBlock
  # @param child [DOMObject]
  # @return [Boolean]
  def self.list_block?(child)
    [::RBMark::DOM::ULBlock, ::RBMark::DOM::OLBlock].include?(child.class)
  end

  # Check if 2 elements can be merged: an indent block merges into any
  # list, an unordered list into an unordered list, and an ordered list
  # into an ordered list whose last number is lower than its first one.
  # @param list [DOMObject] list seen earlier
  # @param child [DOMObject] element that follows it
  # @return [Boolean]
  def self.mergeable?(list, child)
    case child
    when ::RBMark::DOM::IndentBlock
      true
    when ::RBMark::DOM::ULBlock
      list.is_a?(::RBMark::DOM::ULBlock)
    when ::RBMark::DOM::OLBlock
      list.is_a?(::RBMark::DOM::OLBlock) &&
        item_number(child.children.first) > item_number(list.children.last)
    else
      false
    end
  end

  # Numeric value of a list item's :number property. OLBlock stores it as
  # the matched String, and comparing Strings orders "10" before "9".
  # @param item [DOMObject]
  # @return [Integer]
  def self.item_number(item)
    item.properties[:number].to_i
  end

  # Merge 2 elements: list items are moved over as they are, while an
  # indent block is re-parsed as a list element and its children are
  # appended to the last item of the list.
  # @param left [DOMObject] list that receives the content
  # @param right [DOMObject] list or indent block being merged in
  def self.merge_adjacent(left, right)
    case right
    when ::RBMark::DOM::ULBlock, ::RBMark::DOM::OLBlock
      right.children.each { |child| left.append(child) }
    when ::RBMark::DOM::IndentBlock
      left.children.last.append(
        *::RBMark::DOM::ListElement.parse(right.children.first.content)
                                   .children
      )
    end
  end
end
# Paragraph in a document (separated by 2 newlines)
class Paragraph < InlineFormattable
  # Splits the text on blank lines and parses every piece as its own
  # paragraph.
  # (see ::RBMark::DOM::DOMObject.parse)
  # @return [Array<self>]
  def self.parse(text)
    paragraphs = []
    text.split("\n\n").each { |chunk| paragraphs << super(chunk) }
    paragraphs
  end
end
# Heading level 1: a line opening with "# ", optionally closed by "#".
class Heading1 < InlineFormattable
  self.begin = /^# /
  self.end = /#?$/

  # Removes whatever the receiving class's begin and end patterns match,
  # then parses the remainder as inline text. Subclasses reuse this with
  # their own patterns.
  # (see ::RBMark::DOM::DOMObject.parse)
  def self.parse(text)
    without_opening = text.gsub(self.begin, '')
    super(without_opening.gsub(self.end, ''))
  end
end
# Heading level 2: a line opening with "## ".
# Parsing is inherited from Heading1, which strips whatever this class's
# begin and end patterns match.
class Heading2 < Heading1
self.begin = /^## /
# Optional closing "##" at the end of the line
self.end = /(?:##)?$/
end
# Heading level 3: a line opening with "### ".
# Parsing is inherited from Heading1, which strips whatever this class's
# begin and end patterns match.
class Heading3 < Heading1
self.begin = /^### /
# Optional closing "###" at the end of the line
self.end = /(?:###)?$/
end
# Heading level 4: a line opening with "#### ".
# Parsing is inherited from Heading1, which strips whatever this class's
# begin and end patterns match.
class Heading4 < Heading1
self.begin = /^#### /
# Optional closing "####" at the end of the line
self.end = /(?:####)?$/
end
# Heading level 5: a line opening with "##### ".
# Parsing is inherited from Heading1, which strips whatever this class's
# begin and end patterns match.
class Heading5 < Heading1
self.begin = /^##### /
# Optional closing "#####" at the end of the line
self.end = /(?:#####)?$/
end
# Heading level 6: a line opening with "###### ".
# Parsing is inherited from Heading1, which strips whatever this class's
# begin and end patterns match.
class Heading6 < Heading1
self.begin = /^###### /
# Optional closing "######" at the end of the line
self.end = /(?:######)?$/
end
# Alternative heading 1: a title line underlined by three or more "=".
# Parsed results are created as Heading1 instances via alt_for.
class AltHeading1 < InlineFormattable
  self.begin = /^[^\n]+\n={3,}$/m
  self.end = /={3,}$/
  self.alt_for = ::RBMark::DOM::Heading1

  # Only the first line (the title) is parsed; the underline is dropped.
  # (see ::RBMark::DOM::DOMObject.parse)
  def self.parse(text)
    title_line = text.match(/\A[^\n]+$/)[0]
    super(title_line.strip)
  end
end
# Alternative heading 2: a title line underlined by three or more "-".
# Parsed results are created as Heading2 instances via alt_for.
class AltHeading2 < InlineFormattable
  self.begin = /^[^\n]+\n-{3,}$/m
  self.end = /-{3,}$/
  self.alt_for = ::RBMark::DOM::Heading2

  # Only the first line (the title) is parsed; the underline is dropped.
  # (see ::RBMark::DOM::DOMObject.parse)
  def self.parse(text)
    title_line = text.match(/\A[^\n]+$/)[0]
    super(title_line.strip)
  end
end
# Preformatted code block, fenced by lines starting with three backticks.
class CodeBlock < DOMObject
  self.begin = /^```[^\n]*$/
  self.end = /^```[^\n]*\n.*?\n```$/m

  # Stub parser for block text element.
  #
  # Whatever follows the opening fence on its line is stored, stripped, in
  # the :language property. The lines between the first and the last one
  # become a single Text child with trailing whitespace removed.
  # @param text [String]
  # @return [self]
  def self.parse(text)
    instance = new
    info_string = text.match(/\A```([^\n]*)/)[1]
    body = ::RBMark::DOM::Text.new
    body.content = text.lines[1..-2].join('').rstrip
    instance.append(body)
    instance.property language: info_string.strip
    instance
  end
end
# Quote block: consecutive lines starting with ">".
# The quoted text is parsed as a nested document.
class QuoteBlock < Document
  self.begin = /^> \S/
  self.end = /(?:^(?!>)|\Z)/

  # Cuts the first two characters (the quote marker) off every line and
  # parses the result as a document.
  # (see ::RBMark::DOM::DOMObject.parse)
  def self.parse(text)
    unquoted = text.each_line.map { |line| line[2..] }
    super(unquoted.join(''))
  end
end
# Table.
# Placeholder: defines no begin/end patterns and no parser of its own, and
# is not registered as a variant in the registrations visible in this file.
class TableBlock < DOMObject
end
# List element: a single list item.
# Adds nothing to Document, so an item's text is parsed like a nested
# document. Built by ULBlock.parse, OLBlock.element and
# Document.merge_adjacent.
class ListElement < Document
end
# Unordered list: lines starting with "- ", plus indented continuation
# lines that belong to the preceding item.
class ULBlock < DOMObject
  self.begin = /^- +\S+/
  self.end = /(?:^(?!- +\S+| )|\Z)/

  # Splits the text into items and parses each one as a ListElement.
  # (see RBMark::DOM::DOMObject.parse)
  def self.parse(text)
    block = []
    instance = new
    text.lines.each do |line|
      if line.start_with?('- ')
        unless block.empty?
          instance.append(::RBMark::DOM::ListElement.parse(block.join('')))
        end
        block = [line[2..]]
      else
        # Strip at most two leading spaces. The end pattern accepts a
        # continuation line indented by a single space, and cutting two
        # characters blindly would eat its first content character.
        block.append(line.sub(/\A {1,2}/, ''))
      end
    end
    instance.append(::RBMark::DOM::ListElement.parse(block.join('')))
    instance
  end
end
# Ordered list block: lines starting with "<digits>. ", plus continuation
# lines indented by four spaces.
class OLBlock < DOMObject
  self.begin = /^\d+\. +\S+/
  self.end = /(?:^(?!\d+\. +\S+| {4})|\Z)/

  # Groups the lines into items; each item remembers the number it was
  # written with.
  # (see RBMark::DOM::DOMObject.parse)
  def self.parse(text)
    list = new
    number = nil
    pending = []
    text.lines.each do |line|
      # Either the item marker or the four-space indent is removed.
      stripped = line.gsub(/^(?:\d+\. | {4})/, '')
      if line.start_with?(/^\d+\. /)
        list.append(element(pending.join(''), number)) unless pending.empty?
        number = line.match(/^(\d+)\. /)[1]
        pending = [stripped]
      else
        pending.append(stripped)
      end
    end
    list.append(element(pending.join(''), number))
    list
  end

  # Construct a new ListElement
  # @param text [String] item text with markers and indent removed
  # @param counter [String, nil] digits the item was numbered with
  # @return [ListElement] item carrying the :number property
  def self.element(text, counter)
    ::RBMark::DOM::ListElement.parse(text).tap do |item|
      item.property number: counter
    end
  end
end
# Indent block: consecutive lines indented by four spaces.
class IndentBlock < DOMObject
  self.begin = /^ {4}/
  self.end = /(?:^(?! {4})|\Z)/

  # Removes the first four characters of every line and keeps the result,
  # unparsed, in a single Text child.
  # (see RBMark::DOM::DOMObject.parse)
  def self.parse(text)
    body = ::RBMark::DOM::Text.new
    body.content = text.each_line.map { |line| line[4..] }.join('')
    new.tap { |block| block.append(body) }
  end
end
# Horizontal rule: a line made of three or more dashes.
class HorizontalRule < DOMObject
  self.begin = /^-{3,}$/
  self.end = /$/

  class << self
    # Stub for HR: the matched text is ignored.
    # @return [self]
    def parse(_text)
      new
    end
  end

  # Stub for HR length. A rule has no text, but the length has to be
  # positive or Document.vacuum would delete the node.
  # @return [Integer] always 1
  def length
    1
  end
end
# Inline grammar: plain text by default, plus every inline element.
# Registered here, after all of the classes exist. The order is kept
# because the Parser resolves equal match positions by registration order.
InlineFormattable.default(::RBMark::DOM::Text)
[
  ::RBMark::DOM::InlineBold,
  ::RBMark::DOM::InlineItalics,
  ::RBMark::DOM::InlineAltItalics,
  ::RBMark::DOM::InlineUnder,
  ::RBMark::DOM::InlineImage,
  ::RBMark::DOM::InlineLink,
  ::RBMark::DOM::InlinePre,
  ::RBMark::DOM::InlineStrike,
  ::RBMark::DOM::InlineBreak
].each { |inline_class| InlineFormattable.variant(inline_class) }
# Block grammar: paragraphs by default, plus every block element.
# Registered here, after all of the classes exist. The order is kept
# because the Parser resolves equal match positions by registration order.
Document.default(::RBMark::DOM::Paragraph)
[
  ::RBMark::DOM::Heading1,
  ::RBMark::DOM::Heading2,
  ::RBMark::DOM::Heading3,
  ::RBMark::DOM::Heading4,
  ::RBMark::DOM::Heading5,
  ::RBMark::DOM::Heading6,
  ::RBMark::DOM::AltHeading1,
  ::RBMark::DOM::AltHeading2,
  ::RBMark::DOM::QuoteBlock,
  ::RBMark::DOM::CodeBlock,
  ::RBMark::DOM::ULBlock,
  ::RBMark::DOM::OLBlock,
  ::RBMark::DOM::IndentBlock,
  ::RBMark::DOM::HorizontalRule
].each { |block_class| Document.variant(block_class) }
end
end