summaryrefslogtreecommitdiffstats
path: root/parsers/Paragraphs.java
blob: 02d6de692a9237c8629295c649a10752adbfd833 (plain) (blame)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
package scruf.parsers;

import java.util.regex.*;

/**
 * Wraps plain-text paragraphs in {@code <p>} tags.
 *
 * <p>A paragraph is a run of one or more consecutive non-empty lines, each
 * terminated by {@code '\n'}. A run whose first line is itself an HTML tag
 * line (e.g. an {@code <h1>} heading or a {@code <blockquote>}) is treated as
 * an existing HTML block and is left untouched.
 *
 * <p>Because every line of a run must end in {@code '\n'}, a final line that
 * is not newline-terminated is never part of a match and is copied through
 * unwrapped.
 */
public class Paragraphs implements Parser {
    /** Replacement template; {@code $0} is the whole matched run of lines. */
    private static final String PARAGRAPH = "<p>\n$0</p>\n";

    /** One or more consecutive non-empty lines, each ending in a newline. */
    private static final Pattern BLOCK_PATTERN =
        Pattern.compile("(^.+$\\n)+", Pattern.MULTILINE);

    /**
     * Detects a block whose first line starts with {@code '<'} and ends with
     * {@code '>'}. Compiled without MULTILINE, so {@code ^} anchors only at
     * the very start of the block being tested.
     */
    private static final Pattern HTML_TAG_PATTERN =
        Pattern.compile("^\\<.+?\\>\\n");

    /**
     * Wraps every paragraph found in {@code fileContent} in {@code <p>} tags.
     *
     * @param fileContent the text to process
     * @return the text with each paragraph replaced by
     *         {@code <p>\n}paragraph{@code </p>\n}; blocks that already start
     *         with an HTML tag line, blank lines and any unterminated last
     *         line are copied unchanged
     */
    public String parse(String fileContent) {
        Matcher matcher = BLOCK_PATTERN.matcher(fileContent);
        // StringBuffer (not StringBuilder): the StringBuilder overloads of
        // appendReplacement/appendTail only exist from Java 9 onwards.
        StringBuffer result = new StringBuffer();
        while (matcher.find()) {
            // Only wrap blocks that are not already HTML. Skipping
            // appendReplacement for an HTML block does not lose it: the text
            // is copied verbatim by the next appendReplacement or by
            // appendTail below.
            if (!HTML_TAG_PATTERN.matcher(matcher.group()).find()) {
                matcher.appendReplacement(result, PARAGRAPH);
            }
        }
        matcher.appendTail(result);
        return result.toString();
    }
}